cricos_scrape 2.0 → 2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/cricos_scrape +40 -0
- data/cricos_scrape.gemspec +7 -9
- data/lib/cricos_scrape/bulk_import_courses.rb +2 -2
- data/lib/cricos_scrape/bulk_import_institutions.rb +2 -2
- data/lib/cricos_scrape/entities/address.rb +4 -0
- data/lib/cricos_scrape/entities/contact.rb +4 -0
- data/lib/cricos_scrape/entities/contact_officer.rb +4 -0
- data/lib/cricos_scrape/entities/course.rb +4 -0
- data/lib/cricos_scrape/entities/institution.rb +4 -0
- data/lib/cricos_scrape/entities/location.rb +4 -0
- data/lib/cricos_scrape/import_contacts.rb +2 -2
- data/lib/cricos_scrape/importer/contact_importer.rb +120 -0
- data/lib/cricos_scrape/importer/course_importer.rb +291 -0
- data/lib/cricos_scrape/importer/institution_importer.rb +279 -0
- data/lib/cricos_scrape/version.rb +1 -1
- data/lib/cricos_scrape.rb +4 -5
- metadata +16 -44
- data/CONTRIBUTING.md +0 -51
- data/Gemfile +0 -2
- data/Gemfile.lock +0 -64
- data/Procfile +0 -3
- data/Rakefile +0 -13
- data/spec/contact_importer_spec.rb +0 -76
- data/spec/course_importer_spec.rb +0 -71
- data/spec/fixtures/contact_details_of_state_act_uri.html +0 -546
- data/spec/fixtures/contact_details_of_state_wa_uri.html +0 -546
- data/spec/fixtures/course_details_with_contact_officers_table_grid.html +0 -467
- data/spec/fixtures/course_details_without_pagination_uri.html +0 -470
- data/spec/fixtures/courses_list_by_location_id_uri.html +0 -174
- data/spec/fixtures/institution_details_with_pagination_location_page_1_uri.html +0 -406
- data/spec/fixtures/institution_details_with_pagination_location_page_2_uri.html +0 -358
- data/spec/fixtures/institution_details_with_po_box_postal_address.html +0 -240
- data/spec/fixtures/institution_details_with_trading_name.html +0 -322
- data/spec/fixtures/institution_details_without_locations_details_uri.html +0 -151
- data/spec/fixtures/institution_details_without_pagination_location_uri.html +0 -299
- data/spec/fixtures/not_found_course_details_uri.html +0 -837
- data/spec/fixtures/not_found_institution_details.html +0 -36
- data/spec/institution_importer_spec.rb +0 -138
- data/spec/spec_helper.rb +0 -67
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3961f2418e9bc173a4412c6d16a83408dfe2c280
|
4
|
+
data.tar.gz: fdf7c3c46869cd7e805a05e246b36649cacfaa19
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 698d1a4564472abee988d847e06c93f93f718102036b5eb3ddd68a795c86b6827302b21da9db2490b7a647922227d38da9551a76430ff593ffc9c7eed6a848e6
|
7
|
+
data.tar.gz: c4a81a6e2d005d3047f2170a85e66c4c10bdaf73196b068e5ddeb2a3fe0bfd14f1fe4a9a5e7d263aad08beb314c2532d41e4b16524aee47c3e5813a755a7f772
|
data/README.md
CHANGED
@@ -8,7 +8,7 @@
|
|
8
8
|
CRICOS lacks an API for data retrieval (as do many government-based services). This gem
|
9
9
|
helps scrape data from [http://cricos.education.gov.au](http://cricos.education.gov.au).
|
10
10
|
|
11
|
-
This gem supports Ruby 2.
|
11
|
+
This gem supports MRI Ruby 2.0.0+.
|
12
12
|
|
13
13
|
# Features
|
14
14
|
|
data/bin/cricos_scrape
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'cricos_scrape'
|
4
|
+
require 'cricos_scrape/version'
|
5
|
+
require 'commander/import'
|
6
|
+
|
7
|
+
program :version, CricosScrape::VERSION.to_s
|
8
|
+
program :description, 'Scrape data from CRICOS website'
|
9
|
+
|
10
|
+
command :scrape do |c|
|
11
|
+
c.syntax = 'cricos_scrape scrape [institutions|courses|contacts] [min_id:1] [max_id:10000]'
|
12
|
+
c.summary = 'Scrape entities from CRICOS'
|
13
|
+
c.description = c.summary
|
14
|
+
c.example 'Import institutions', 'cricos_scrape scrape institutions'
|
15
|
+
c.example 'Import institutions and persist to a file', 'cricos_scrape scrape institutions >> institutions.json'
|
16
|
+
c.example 'Import institutions with specified ID range [1-200]', 'cricos_scrape scrape institutions 1 200'
|
17
|
+
c.example 'Import courses', 'cricos_scrape scrape courses'
|
18
|
+
c.example 'Import courses and persist to a file', 'cricos_scrape scrape courses >> courses.json'
|
19
|
+
c.example 'Import courses with specified ID range [1-200]', 'cricos_scrape scrape courses 1 200'
|
20
|
+
c.example 'Import contacts', 'cricos_scrape scrape contacts. NOTE: ID range option does not apply'
|
21
|
+
c.example 'Import contacts and persist to a file', 'cricos_scrape scrape contacts >> contacts.json'
|
22
|
+
|
23
|
+
c.action do |args, options|
|
24
|
+
entity = args[0]
|
25
|
+
min_id = args[1] || 1
|
26
|
+
max_id = args[2] || 10000
|
27
|
+
|
28
|
+
case entity
|
29
|
+
when 'institutions'
|
30
|
+
CricosScrape::BulkImportInstitutions.new(min_id, max_id).perform
|
31
|
+
when 'courses'
|
32
|
+
CricosScrape::BulkImportCourses.new(min_id, max_id).perform
|
33
|
+
when 'contacts'
|
34
|
+
CricosScrape::ImportContacts.new.perform
|
35
|
+
else
|
36
|
+
STDERR.puts "[ERROR] Invalid entity. Please see `cricos_scrape scrape -h` for more details"
|
37
|
+
exit 1
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
data/cricos_scrape.gemspec
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
-
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'cricos_scrape/version'
|
2
5
|
|
3
6
|
Gem::Specification.new do |spec|
|
4
7
|
spec.name = 'cricos_scrape'
|
@@ -10,15 +13,10 @@ Gem::Specification.new do |spec|
|
|
10
13
|
spec.homepage = 'https://github.com/ruby-journal/cricos_scrape.rb'
|
11
14
|
spec.license = 'MIT'
|
12
15
|
|
13
|
-
spec.files =
|
14
|
-
|
15
|
-
'lib/cricos_scrape/*.rb',
|
16
|
-
'spec/*.rb',
|
17
|
-
'spec/fixtures/*.html']
|
18
|
-
|
19
|
-
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
16
|
+
spec.files = `git ls-files -z -- lib/* bin/* LICENSE.md README.md cricos_scrape.gemspec`.split("\x0")
|
17
|
+
spec.executables = ['cricos_scrape']
|
20
18
|
spec.test_files = spec.files.grep(%r{^(spec)/})
|
21
|
-
spec.required_ruby_version = '>= 2.
|
19
|
+
spec.required_ruby_version = '>= 2.0.0'
|
22
20
|
|
23
21
|
spec.require_paths = ['lib']
|
24
22
|
|
@@ -0,0 +1,4 @@
|
|
1
|
+
module CricosScrape
|
2
|
+
class Course < Struct.new(:course_id, :course_name, :course_code, :dual_qualification, :field_of_education, :broad_field, :narrow_field, :detailed_field, :course_level, :foundation_studies, :work_component, :course_language, :duration, :total_cost, :contact_officers, :location_ids)
|
3
|
+
end
|
4
|
+
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'cricos_scrape/entities/contact'
|
2
|
+
require 'cricos_scrape/entities/address'
|
3
|
+
|
4
|
+
module CricosScrape
|
5
|
+
class ContactImporter
|
6
|
+
|
7
|
+
CONTACT_URL = 'http://cricos.education.gov.au/Contacts/CRICOSContacts.aspx'
|
8
|
+
STATES_CODE = ['ACT', 'NSW', 'NT', 'QLD', 'SA', 'TAS', 'VIC', 'WA']
|
9
|
+
|
10
|
+
def initialize(agent)
|
11
|
+
@agent = agent
|
12
|
+
end
|
13
|
+
|
14
|
+
def run
|
15
|
+
contacts = []
|
16
|
+
|
17
|
+
for state in STATES_CODE
|
18
|
+
@page = agent.get(url_for(state))
|
19
|
+
if exist_contacts_of_state?
|
20
|
+
@table_contains_contact = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetDetails_cricosContactDetails_pnlContactLists table').children
|
21
|
+
|
22
|
+
number_of_rows_per_contact = 18
|
23
|
+
start_contact_row = 3
|
24
|
+
end_contact_row = @table_contains_contact.count - number_of_rows_per_contact
|
25
|
+
|
26
|
+
for i in (start_contact_row..end_contact_row).step(number_of_rows_per_contact)
|
27
|
+
@row_index = i
|
28
|
+
|
29
|
+
contact = Contact.new
|
30
|
+
contact.type_of_course = find_type_of_course
|
31
|
+
contact.name = find_name
|
32
|
+
contact.organisation = find_organisation
|
33
|
+
contact.postal_address = find_postal_address
|
34
|
+
contact.telephone = find_telephone
|
35
|
+
contact.facsimile = find_facsimile
|
36
|
+
contact.email = find_email
|
37
|
+
|
38
|
+
contacts << contact
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
contacts
|
44
|
+
end
|
45
|
+
|
46
|
+
private
|
47
|
+
|
48
|
+
attr_reader :agent, :page
|
49
|
+
|
50
|
+
def url_for(state_code)
|
51
|
+
"#{CONTACT_URL}?StateCode=#{state_code}"
|
52
|
+
end
|
53
|
+
|
54
|
+
def exist_contacts_of_state?
|
55
|
+
!!@page.at('#__tab_ctl00_cphDefaultPage_tabContainer_sheetDetails')
|
56
|
+
end
|
57
|
+
|
58
|
+
def find_value_of_field(field)
|
59
|
+
field.nil? ? nil : field.text.strip
|
60
|
+
end
|
61
|
+
|
62
|
+
def find_type_of_course
|
63
|
+
find_value_of_field(@table_contains_contact[@row_index])
|
64
|
+
end
|
65
|
+
|
66
|
+
def find_name
|
67
|
+
name_row = @table_contains_contact[@row_index+4].children
|
68
|
+
find_value_of_field(name_row[3]).empty? ? find_value_of_field(name_row[2]) : find_value_of_field(name_row[3])
|
69
|
+
end
|
70
|
+
|
71
|
+
def find_organisation
|
72
|
+
organisation_row = @table_contains_contact[@row_index+6].children
|
73
|
+
find_value_of_field(organisation_row[3])
|
74
|
+
end
|
75
|
+
|
76
|
+
def find_postal_address
|
77
|
+
address = Address.new
|
78
|
+
|
79
|
+
address_row = @table_contains_contact[@row_index+8].children
|
80
|
+
postal_address_cell = address_row[3].children
|
81
|
+
|
82
|
+
# delete <br>
|
83
|
+
lines = postal_address_cell - postal_address_cell.css('br')
|
84
|
+
address.address_line_1 = find_value_of_field(lines[0])
|
85
|
+
|
86
|
+
if line2 = find_value_of_field(lines[1])
|
87
|
+
address.suburb, address.state, address.postcode = extract_suburb_and_state_and_postcode_from(line2)
|
88
|
+
end
|
89
|
+
|
90
|
+
address
|
91
|
+
end
|
92
|
+
|
93
|
+
def extract_suburb_and_state_and_postcode_from(line)
|
94
|
+
line.scan(/^(.*)\s(#{australia_states_code_regex})\s(#{australia_postcode_regex})$/).first
|
95
|
+
end
|
96
|
+
|
97
|
+
def australia_states_code_regex
|
98
|
+
'ACT|NSW|NT|QLD|SA|TAS|VIC|WA'
|
99
|
+
end
|
100
|
+
|
101
|
+
def australia_postcode_regex
|
102
|
+
'\d{4}'
|
103
|
+
end
|
104
|
+
|
105
|
+
def find_telephone
|
106
|
+
telephone_row = @table_contains_contact[@row_index+10].children
|
107
|
+
find_value_of_field(telephone_row[3])
|
108
|
+
end
|
109
|
+
|
110
|
+
def find_facsimile
|
111
|
+
facsimile_row = @table_contains_contact[@row_index+12].children
|
112
|
+
find_value_of_field(facsimile_row[3])
|
113
|
+
end
|
114
|
+
|
115
|
+
def find_email
|
116
|
+
email_row = @table_contains_contact[@row_index+14].children
|
117
|
+
find_value_of_field(email_row[3])
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
@@ -0,0 +1,291 @@
|
|
1
|
+
require 'cricos_scrape/entities/course'
|
2
|
+
require 'cricos_scrape/entities/contact_officer'
|
3
|
+
|
4
|
+
module CricosScrape
|
5
|
+
class CourseImporter
|
6
|
+
|
7
|
+
COURSE_URL = 'http://cricos.education.gov.au/Course/CourseDetails.aspx'
|
8
|
+
|
9
|
+
def initialize(agent, **params)
|
10
|
+
@agent = agent
|
11
|
+
@course_id = params.fetch(:course_id)
|
12
|
+
@page = agent.get(url)
|
13
|
+
end
|
14
|
+
|
15
|
+
def run
|
16
|
+
return if course_not_found?
|
17
|
+
|
18
|
+
course = Course.new
|
19
|
+
course.course_id = course_id
|
20
|
+
course.course_name = find_course_name
|
21
|
+
course.course_code = find_course_code
|
22
|
+
course.dual_qualification = find_dual_qualification
|
23
|
+
course.field_of_education = find_field_of_education
|
24
|
+
course.broad_field = find_education_broad_field
|
25
|
+
course.narrow_field = find_education_narrow_field
|
26
|
+
course.detailed_field = find_education_detailed_field
|
27
|
+
course.course_level = find_course_level
|
28
|
+
course.foundation_studies = find_foundation_studies
|
29
|
+
course.work_component = find_work_component
|
30
|
+
course.course_language = find_course_language
|
31
|
+
course.duration = find_duration
|
32
|
+
course.total_cost = find_total_cost
|
33
|
+
|
34
|
+
course.contact_officers = find_contact_officers
|
35
|
+
course.location_ids = find_course_location
|
36
|
+
|
37
|
+
course
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
41
|
+
|
42
|
+
attr_reader :agent, :course_id, :page
|
43
|
+
|
44
|
+
def url
|
45
|
+
"#{COURSE_URL}?CourseID=#{course_id}"
|
46
|
+
end
|
47
|
+
|
48
|
+
# there is no record not found page
|
49
|
+
# instead a search page is returned
|
50
|
+
def course_not_found?
|
51
|
+
@page.at('#contentBody h1').text == "Course Search"
|
52
|
+
end
|
53
|
+
|
54
|
+
def find_value_of_field(field)
|
55
|
+
field.text.strip unless field.nil?
|
56
|
+
end
|
57
|
+
|
58
|
+
def find_course_name
|
59
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblCourseName')
|
60
|
+
find_value_of_field(field)
|
61
|
+
end
|
62
|
+
|
63
|
+
def find_course_code
|
64
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblCourseCode')
|
65
|
+
find_value_of_field(field)
|
66
|
+
end
|
67
|
+
|
68
|
+
def find_dual_qualification
|
69
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblDualQualification')
|
70
|
+
find_value_of_field(field)
|
71
|
+
end
|
72
|
+
|
73
|
+
def find_field_of_education
|
74
|
+
row = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_trFofEHeader').children
|
75
|
+
# NOTE: A space lookalike character might be returned. This is to ensure its conversion to a correct space
|
76
|
+
find_value_of_field(row[3]).ord == 160 ? '' : find_value_of_field(row[3])
|
77
|
+
end
|
78
|
+
|
79
|
+
def find_education_broad_field
|
80
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblFieldOfEducationBroad1')
|
81
|
+
find_value_of_field(field)
|
82
|
+
end
|
83
|
+
|
84
|
+
def find_education_narrow_field
|
85
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblFieldOfEducationNarrow1')
|
86
|
+
find_value_of_field(field)
|
87
|
+
end
|
88
|
+
|
89
|
+
def find_education_detailed_field
|
90
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblFieldOfEducationDetailed1')
|
91
|
+
find_value_of_field(field)
|
92
|
+
end
|
93
|
+
|
94
|
+
def find_course_level
|
95
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblCourseLevel')
|
96
|
+
find_value_of_field(field)
|
97
|
+
end
|
98
|
+
|
99
|
+
def find_foundation_studies
|
100
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblFoundationStudies')
|
101
|
+
find_value_of_field(field)
|
102
|
+
end
|
103
|
+
|
104
|
+
def find_work_component
|
105
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblWorkComponent')
|
106
|
+
find_value_of_field(field)
|
107
|
+
end
|
108
|
+
|
109
|
+
def find_course_language
|
110
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblCourseLanguage')
|
111
|
+
find_value_of_field(field)
|
112
|
+
end
|
113
|
+
|
114
|
+
def find_duration
|
115
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblDuration')
|
116
|
+
find_value_of_field(field)
|
117
|
+
end
|
118
|
+
|
119
|
+
def find_total_cost
|
120
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblTotalCourseCost')
|
121
|
+
find_value_of_field(field)
|
122
|
+
end
|
123
|
+
|
124
|
+
def find_contact_officers
|
125
|
+
contact_officers = []
|
126
|
+
|
127
|
+
contact_officers_list = @page.search('//div[starts-with(@id, "ctl00_cphDefaultPage_tabContainer_sheetContactDetail_contactDetail_pnl")]')
|
128
|
+
|
129
|
+
contact_officers_list.each do |contact_officer|
|
130
|
+
@contact_officer_area = contact_officer
|
131
|
+
@contact_officer_table = @contact_officer_area.at('table').children
|
132
|
+
|
133
|
+
if contains_contact_details_grid?
|
134
|
+
contact_officers += find_contact_officer_grid
|
135
|
+
else
|
136
|
+
contact_officers << find_contact_officer
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
contact_officers
|
141
|
+
end
|
142
|
+
|
143
|
+
def find_contact_officer_grid
|
144
|
+
contact_officers = []
|
145
|
+
|
146
|
+
excess_row_at_the_end_table = 2
|
147
|
+
data_row_start = 3
|
148
|
+
data_row_end = @contact_officer_table.count - excess_row_at_the_end_table
|
149
|
+
|
150
|
+
for i in data_row_start..data_row_end
|
151
|
+
contact_row = @contact_officer_table[i].children
|
152
|
+
|
153
|
+
contact = ContactOfficer.new
|
154
|
+
contact.role = find_contact_officer_role
|
155
|
+
contact.name = find_value_of_field(contact_row[1])
|
156
|
+
contact.phone = find_value_of_field(contact_row[2])
|
157
|
+
contact.fax = find_value_of_field(contact_row[3])
|
158
|
+
contact.email = find_value_of_field(contact_row[4])
|
159
|
+
|
160
|
+
contact_officers << contact
|
161
|
+
end
|
162
|
+
|
163
|
+
contact_officers
|
164
|
+
end
|
165
|
+
|
166
|
+
def find_contact_officer
|
167
|
+
contact = ContactOfficer.new
|
168
|
+
contact.role = find_contact_officer_role
|
169
|
+
contact.name = find_contact_officer_name
|
170
|
+
contact.title = find_contact_officer_title
|
171
|
+
contact.phone = find_contact_officer_phone
|
172
|
+
contact.fax = find_contact_officer_fax
|
173
|
+
contact.email = find_contact_officer_email
|
174
|
+
|
175
|
+
contact
|
176
|
+
end
|
177
|
+
|
178
|
+
def find_contact_officer_role
|
179
|
+
row = @contact_officer_area.children
|
180
|
+
find_value_of_field(row[1]).sub(':', '')
|
181
|
+
end
|
182
|
+
|
183
|
+
def find_contact_officer_name
|
184
|
+
row = @contact_officer_table[1].children
|
185
|
+
find_value_of_field(row[3])
|
186
|
+
end
|
187
|
+
|
188
|
+
def find_contact_officer_title
|
189
|
+
row = @contact_officer_table[3].children
|
190
|
+
find_value_of_field(row[3])
|
191
|
+
end
|
192
|
+
|
193
|
+
def find_contact_officer_phone
|
194
|
+
row = @contact_officer_table[5].children
|
195
|
+
find_value_of_field(row[3])
|
196
|
+
end
|
197
|
+
|
198
|
+
def find_contact_officer_fax
|
199
|
+
row = @contact_officer_table[7].children
|
200
|
+
find_value_of_field(row[3])
|
201
|
+
end
|
202
|
+
|
203
|
+
def find_contact_officer_email
|
204
|
+
row = @contact_officer_table[9]
|
205
|
+
find_value_of_field(row.children[3]) unless row.nil?
|
206
|
+
end
|
207
|
+
|
208
|
+
def contains_contact_details_grid?
|
209
|
+
contact_officer_area_css_id = @contact_officer_area.attributes['id'].text
|
210
|
+
@page.search("//*[@id='#{contact_officer_area_css_id}']/div/table[starts-with(@id, 'ctl00_cphDefaultPage_tabContainer_sheetContactDetail_contactDetail_grid')]").any?
|
211
|
+
end
|
212
|
+
|
213
|
+
#Get all locations of course
|
214
|
+
def find_course_location
|
215
|
+
location_ids = []
|
216
|
+
|
217
|
+
if location_results_paginated?
|
218
|
+
for page_number in 1..total_pages
|
219
|
+
jump_to_page(page_number)
|
220
|
+
location_ids += fetch_location_ids_from_current_page
|
221
|
+
end
|
222
|
+
else
|
223
|
+
location_ids += fetch_location_ids_from_current_page
|
224
|
+
end
|
225
|
+
|
226
|
+
location_ids
|
227
|
+
end
|
228
|
+
|
229
|
+
def pagination
|
230
|
+
@page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseLocationList_gridSearchResults .gridPager')
|
231
|
+
end
|
232
|
+
|
233
|
+
def location_results_paginated?
|
234
|
+
!!pagination
|
235
|
+
end
|
236
|
+
|
237
|
+
def total_pages
|
238
|
+
pagination.children[1].text.strip[/^Page [0-9]+ of ([0-9]+).*/, 1].to_i
|
239
|
+
end
|
240
|
+
|
241
|
+
def current_pagination_page
|
242
|
+
pagination.children[1].text.strip[/^Page ([0-9]+) of [0-9]+.*/, 1].to_i
|
243
|
+
end
|
244
|
+
|
245
|
+
def jump_to_page(page_number)
|
246
|
+
return @page if page_number == current_pagination_page
|
247
|
+
|
248
|
+
hidden_form = @page.form_with :id => "aspnetForm"
|
249
|
+
hidden_form['__EVENTTARGET'] = 'ctl00$cphDefaultPage$tabContainer$sheetCourseDetail$courseLocationList$gridSearchResults'
|
250
|
+
hidden_form['__EVENTARGUMENT'] = "Page$#{page_number}"
|
251
|
+
begin
|
252
|
+
@page = hidden_form.submit(nil, {'action' => 'change-page'})
|
253
|
+
rescue Mechanize::ResponseCodeError
|
254
|
+
sleep 5
|
255
|
+
scrape_course(course_id)
|
256
|
+
end
|
257
|
+
end
|
258
|
+
|
259
|
+
def get_location_id(row_index)
|
260
|
+
hidden_form = @page.form_with :id => "aspnetForm"
|
261
|
+
hidden_form['__EVENTTARGET'] = 'ctl00$cphDefaultPage$tabContainer$sheetCourseDetail$courseLocationList$gridSearchResults'
|
262
|
+
hidden_form['__EVENTARGUMENT'] = "click-#{row_index-3}"
|
263
|
+
begin
|
264
|
+
course_page = hidden_form.submit(nil, {'action' => 'get-location-id'})
|
265
|
+
rescue Mechanize::ResponseCodeError
|
266
|
+
sleep 5
|
267
|
+
scrape_course(course_id)
|
268
|
+
end
|
269
|
+
|
270
|
+
course_page.uri.to_s[/LocationID=([0-9]+)/, 1]
|
271
|
+
end
|
272
|
+
|
273
|
+
def fetch_location_ids_from_current_page
|
274
|
+
location_ids = []
|
275
|
+
|
276
|
+
# location_list is table contains locations in current page
|
277
|
+
location_list = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseLocationList_gridSearchResults').children
|
278
|
+
|
279
|
+
excess_row_at_the_end_table = location_results_paginated? ? 3 : 2
|
280
|
+
start_location_row = 3
|
281
|
+
end_location_row = location_list.count - excess_row_at_the_end_table
|
282
|
+
|
283
|
+
for i in start_location_row..end_location_row
|
284
|
+
location_ids << get_location_id(i)
|
285
|
+
end
|
286
|
+
|
287
|
+
location_ids
|
288
|
+
end
|
289
|
+
|
290
|
+
end
|
291
|
+
end
|