cricos_scrape 2.0 → 2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/bin/cricos_scrape +40 -0
- data/cricos_scrape.gemspec +7 -9
- data/lib/cricos_scrape/bulk_import_courses.rb +2 -2
- data/lib/cricos_scrape/bulk_import_institutions.rb +2 -2
- data/lib/cricos_scrape/entities/address.rb +4 -0
- data/lib/cricos_scrape/entities/contact.rb +4 -0
- data/lib/cricos_scrape/entities/contact_officer.rb +4 -0
- data/lib/cricos_scrape/entities/course.rb +4 -0
- data/lib/cricos_scrape/entities/institution.rb +4 -0
- data/lib/cricos_scrape/entities/location.rb +4 -0
- data/lib/cricos_scrape/import_contacts.rb +2 -2
- data/lib/cricos_scrape/importer/contact_importer.rb +120 -0
- data/lib/cricos_scrape/importer/course_importer.rb +291 -0
- data/lib/cricos_scrape/importer/institution_importer.rb +279 -0
- data/lib/cricos_scrape/version.rb +1 -1
- data/lib/cricos_scrape.rb +4 -5
- metadata +16 -44
- data/CONTRIBUTING.md +0 -51
- data/Gemfile +0 -2
- data/Gemfile.lock +0 -64
- data/Procfile +0 -3
- data/Rakefile +0 -13
- data/spec/contact_importer_spec.rb +0 -76
- data/spec/course_importer_spec.rb +0 -71
- data/spec/fixtures/contact_details_of_state_act_uri.html +0 -546
- data/spec/fixtures/contact_details_of_state_wa_uri.html +0 -546
- data/spec/fixtures/course_details_with_contact_officers_table_grid.html +0 -467
- data/spec/fixtures/course_details_without_pagination_uri.html +0 -470
- data/spec/fixtures/courses_list_by_location_id_uri.html +0 -174
- data/spec/fixtures/institution_details_with_pagination_location_page_1_uri.html +0 -406
- data/spec/fixtures/institution_details_with_pagination_location_page_2_uri.html +0 -358
- data/spec/fixtures/institution_details_with_po_box_postal_address.html +0 -240
- data/spec/fixtures/institution_details_with_trading_name.html +0 -322
- data/spec/fixtures/institution_details_without_locations_details_uri.html +0 -151
- data/spec/fixtures/institution_details_without_pagination_location_uri.html +0 -299
- data/spec/fixtures/not_found_course_details_uri.html +0 -837
- data/spec/fixtures/not_found_institution_details.html +0 -36
- data/spec/institution_importer_spec.rb +0 -138
- data/spec/spec_helper.rb +0 -67
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3961f2418e9bc173a4412c6d16a83408dfe2c280
|
4
|
+
data.tar.gz: fdf7c3c46869cd7e805a05e246b36649cacfaa19
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 698d1a4564472abee988d847e06c93f93f718102036b5eb3ddd68a795c86b6827302b21da9db2490b7a647922227d38da9551a76430ff593ffc9c7eed6a848e6
|
7
|
+
data.tar.gz: c4a81a6e2d005d3047f2170a85e66c4c10bdaf73196b068e5ddeb2a3fe0bfd14f1fe4a9a5e7d263aad08beb314c2532d41e4b16524aee47c3e5813a755a7f772
|
data/README.md
CHANGED
@@ -8,7 +8,7 @@
|
|
8
8
|
CRICOS lacks an API for data retrieval (as do many government-based services). This gem
|
9
9
|
helps scrape data from [http://cricos.education.gov.au](http://cricos.education.gov.au).
|
10
10
|
|
11
|
-
This gem supports Ruby 2.
|
11
|
+
This gem supports MRI Ruby 2.0.0+.
|
12
12
|
|
13
13
|
# Features
|
14
14
|
|
data/bin/cricos_scrape
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'cricos_scrape'
|
4
|
+
require 'cricos_scrape/version'
|
5
|
+
require 'commander/import'
|
6
|
+
|
7
|
+
program :version, CricosScrape::VERSION.to_s
|
8
|
+
program :description, 'Scrape data from CRICOS website'
|
9
|
+
|
10
|
+
# Defines the `scrape` sub-command: dispatches to the bulk importer matching
# the requested entity (institutions, courses or contacts).
command :scrape do |c|
  c.syntax = 'cricos_scrape scrape [institutions|courses|contacts] [min_id:1] [max_id:10000]'
  c.summary = 'Scrape entities from CRICOS'
  c.description = c.summary
  c.example 'Import institutions', 'cricos_scrape scrape institutions'
  c.example 'Import institutions and persist to a file', 'cricos_scrape scrape institutions >> institutions.json'
  c.example 'Import institutions with specified ID range [1-200]', 'cricos_scrape scrape institutions 1 200'
  c.example 'Import courses', 'cricos_scrape scrape courses'
  c.example 'Import courses and persist to a file', 'cricos_scrape scrape courses >> courses.json'
  c.example 'Import courses with specified ID range [1-200]', 'cricos_scrape scrape courses 1 200'
  # The note belongs to the description; the second argument is the literal command line.
  c.example 'Import contacts (NOTE: ID range option does not apply)', 'cricos_scrape scrape contacts'
  c.example 'Import contacts and persist to a file', 'cricos_scrape scrape contacts >> contacts.json'

  c.action do |args, _options|
    # Command-line arguments arrive as Strings while the defaults are Integers;
    # normalise to Integer so both bounds always have the same type.
    # Base 10 is forced so that "010" means ten rather than octal eight.
    parse_id = lambda do |raw, default|
      begin
        raw.nil? ? default : Integer(raw, 10)
      rescue ArgumentError
        STDERR.puts "[ERROR] Invalid ID '#{raw}'. IDs must be integers. Please see `cricos_scrape scrape -h` for more details"
        exit 1
      end
    end

    case args[0]
    when 'institutions'
      CricosScrape::BulkImportInstitutions.new(parse_id.call(args[1], 1), parse_id.call(args[2], 10000)).perform
    when 'courses'
      CricosScrape::BulkImportCourses.new(parse_id.call(args[1], 1), parse_id.call(args[2], 10000)).perform
    when 'contacts'
      # ID range arguments are ignored for contacts.
      CricosScrape::ImportContacts.new.perform
    else
      STDERR.puts "[ERROR] Invalid entity. Please see `cricos_scrape scrape -h` for more details"
      exit 1
    end
  end
end
|
data/cricos_scrape.gemspec
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
-
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'cricos_scrape/version'
|
2
5
|
|
3
6
|
Gem::Specification.new do |spec|
|
4
7
|
spec.name = 'cricos_scrape'
|
@@ -10,15 +13,10 @@ Gem::Specification.new do |spec|
|
|
10
13
|
spec.homepage = 'https://github.com/ruby-journal/cricos_scrape.rb'
|
11
14
|
spec.license = 'MIT'
|
12
15
|
|
13
|
-
spec.files =
|
14
|
-
|
15
|
-
'lib/cricos_scrape/*.rb',
|
16
|
-
'spec/*.rb',
|
17
|
-
'spec/fixtures/*.html']
|
18
|
-
|
19
|
-
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
16
|
+
spec.files = `git ls-files -z -- lib/* bin/* LICENSE.md README.md cricos_scrape.gemspec`.split("\x0")
|
17
|
+
spec.executables = ['cricos_scrape']
|
20
18
|
spec.test_files = spec.files.grep(%r{^(spec)/})
|
21
|
-
spec.required_ruby_version = '>= 2.
|
19
|
+
spec.required_ruby_version = '>= 2.0.0'
|
22
20
|
|
23
21
|
spec.require_paths = ['lib']
|
24
22
|
|
@@ -0,0 +1,4 @@
|
|
1
|
+
module CricosScrape
  # Plain value holder for one scraped course. Every member starts out nil and
  # is assigned through the accessors generated by Struct.
  class Course < Struct.new(
    :course_id,
    :course_name,
    :course_code,
    :dual_qualification,
    :field_of_education,
    :broad_field,
    :narrow_field,
    :detailed_field,
    :course_level,
    :foundation_studies,
    :work_component,
    :course_language,
    :duration,
    :total_cost,
    :contact_officers,
    :location_ids
  )
  end
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'cricos_scrape/entities/contact'
|
2
|
+
require 'cricos_scrape/entities/address'
|
3
|
+
|
4
|
+
module CricosScrape
  # Scrapes the CRICOS contacts page of every state/territory listed in
  # STATES_CODE and returns the contacts found as Contact structs.
  class ContactImporter
    CONTACT_URL = 'http://cricos.education.gov.au/Contacts/CRICOSContacts.aspx'
    STATES_CODE = ['ACT', 'NSW', 'NT', 'QLD', 'SA', 'TAS', 'VIC', 'WA'].freeze

    # CSS selector of the table whose child nodes hold all contacts of a state.
    CONTACT_TABLE_SELECTOR = '#ctl00_cphDefaultPage_tabContainer_sheetDetails_cricosContactDetails_pnlContactLists table'
    # Each contact occupies this many consecutive child nodes of the table.
    ROWS_PER_CONTACT = 18
    # Index of the first child node belonging to the first contact.
    FIRST_CONTACT_ROW = 3

    # @param agent [#get] page fetcher (a Mechanize agent in production);
    #   `get(url)` must return a page responding to `at`.
    def initialize(agent)
      @agent = agent
    end

    # Fetches the contact page of each state and collects its contacts.
    #
    # @return [Array<Contact>] contacts of all states, in STATES_CODE order
    def run
      contacts = []

      STATES_CODE.each do |state|
        @page = agent.get(url_for(state))
        next unless exist_contacts_of_state?

        @table_contains_contact = @page.at(CONTACT_TABLE_SELECTOR).children
        last_contact_row = @table_contains_contact.count - ROWS_PER_CONTACT

        (FIRST_CONTACT_ROW..last_contact_row).step(ROWS_PER_CONTACT) do |row_index|
          @row_index = row_index
          contacts << build_contact
        end
      end

      contacts
    end

    private

    attr_reader :agent, :page

    def url_for(state_code)
      "#{CONTACT_URL}?StateCode=#{state_code}"
    end

    # True when the fetched page contains the contact details tab.
    def exist_contacts_of_state?
      !!@page.at('#__tab_ctl00_cphDefaultPage_tabContainer_sheetDetails')
    end

    # Builds the Contact whose first table row is @row_index.
    def build_contact
      contact = Contact.new
      contact.type_of_course = find_type_of_course
      contact.name = find_name
      contact.organisation = find_organisation
      contact.postal_address = find_postal_address
      contact.telephone = find_telephone
      contact.facsimile = find_facsimile
      contact.email = find_email
      contact
    end

    # @return [String, nil] stripped text of the node, or nil when the node is missing
    def find_value_of_field(field)
      field.text.strip unless field.nil?
    end

    # Children of the table row located `offset` nodes after the current contact's first row.
    def contact_row(offset)
      @table_contains_contact[@row_index + offset].children
    end

    # Stripped text of the value cell (child index 3) of the row at `offset`.
    def contact_value(offset)
      find_value_of_field(contact_row(offset)[3])
    end

    def find_type_of_course
      find_value_of_field(@table_contains_contact[@row_index])
    end

    # Prefers the value cell (index 3) and falls back to cell 2 when the value
    # cell is missing or blank; a missing cell previously raised NoMethodError.
    def find_name
      name_row = contact_row(4)
      primary = find_value_of_field(name_row[3])
      return primary unless primary.nil? || primary.empty?

      find_value_of_field(name_row[2])
    end

    def find_organisation
      contact_value(6)
    end

    # Builds an Address from the postal address cell. The first non-<br> node
    # becomes address_line_1; the second, when present, is parsed into
    # suburb, state and postcode (all three stay nil if it does not parse).
    def find_postal_address
      address = Address.new

      postal_address_cell = contact_row(8)[3].children
      # drop the <br> separators
      lines = postal_address_cell - postal_address_cell.css('br')
      address.address_line_1 = find_value_of_field(lines[0])

      line2 = find_value_of_field(lines[1])
      if line2
        address.suburb, address.state, address.postcode = extract_suburb_and_state_and_postcode_from(line2)
      end

      address
    end

    # @param line [String] e.g. "West Perth WA 6005"
    # @return [Array<String>, nil] [suburb, state, postcode], or nil when the line does not match
    def extract_suburb_and_state_and_postcode_from(line)
      line.scan(/^(.*)\s(#{australia_states_code_regex})\s(#{australia_postcode_regex})$/).first
    end

    # Regex alternation of the known state codes, derived from STATES_CODE.
    def australia_states_code_regex
      STATES_CODE.join('|')
    end

    def australia_postcode_regex
      '\d{4}'
    end

    def find_telephone
      contact_value(10)
    end

    def find_facsimile
      contact_value(12)
    end

    def find_email
      contact_value(14)
    end
  end
end
|
@@ -0,0 +1,291 @@
|
|
1
|
+
require 'cricos_scrape/entities/course'
|
2
|
+
require 'cricos_scrape/entities/contact_officer'
|
3
|
+
|
4
|
+
module CricosScrape
|
5
|
+
class CourseImporter
|
6
|
+
|
7
|
+
COURSE_URL = 'http://cricos.education.gov.au/Course/CourseDetails.aspx'
|
8
|
+
|
9
|
+
def initialize(agent, **params)
|
10
|
+
@agent = agent
|
11
|
+
@course_id = params.fetch(:course_id)
|
12
|
+
@page = agent.get(url)
|
13
|
+
end
|
14
|
+
|
15
|
+
def run
|
16
|
+
return if course_not_found?
|
17
|
+
|
18
|
+
course = Course.new
|
19
|
+
course.course_id = course_id
|
20
|
+
course.course_name = find_course_name
|
21
|
+
course.course_code = find_course_code
|
22
|
+
course.dual_qualification = find_dual_qualification
|
23
|
+
course.field_of_education = find_field_of_education
|
24
|
+
course.broad_field = find_education_broad_field
|
25
|
+
course.narrow_field = find_education_narrow_field
|
26
|
+
course.detailed_field = find_education_detailed_field
|
27
|
+
course.course_level = find_course_level
|
28
|
+
course.foundation_studies = find_foundation_studies
|
29
|
+
course.work_component = find_work_component
|
30
|
+
course.course_language = find_course_language
|
31
|
+
course.duration = find_duration
|
32
|
+
course.total_cost = find_total_cost
|
33
|
+
|
34
|
+
course.contact_officers = find_contact_officers
|
35
|
+
course.location_ids = find_course_location
|
36
|
+
|
37
|
+
course
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
41
|
+
|
42
|
+
attr_reader :agent, :course_id, :page
|
43
|
+
|
44
|
+
def url
|
45
|
+
"#{COURSE_URL}?CourseID=#{course_id}"
|
46
|
+
end
|
47
|
+
|
48
|
+
# there is no record not found page
|
49
|
+
# instead a search page is returned
|
50
|
+
def course_not_found?
|
51
|
+
@page.at('#contentBody h1').text == "Course Search"
|
52
|
+
end
|
53
|
+
|
54
|
+
def find_value_of_field(field)
|
55
|
+
field.text.strip unless field.nil?
|
56
|
+
end
|
57
|
+
|
58
|
+
def find_course_name
|
59
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblCourseName')
|
60
|
+
find_value_of_field(field)
|
61
|
+
end
|
62
|
+
|
63
|
+
def find_course_code
|
64
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblCourseCode')
|
65
|
+
find_value_of_field(field)
|
66
|
+
end
|
67
|
+
|
68
|
+
def find_dual_qualification
|
69
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblDualQualification')
|
70
|
+
find_value_of_field(field)
|
71
|
+
end
|
72
|
+
|
73
|
+
def find_field_of_education
|
74
|
+
row = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_trFofEHeader').children
|
75
|
+
# NOTE: A space lookalike character might be returned. This is to ensure its conversion to a correct space
|
76
|
+
find_value_of_field(row[3]).ord == 160 ? '' : find_value_of_field(row[3])
|
77
|
+
end
|
78
|
+
|
79
|
+
def find_education_broad_field
|
80
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblFieldOfEducationBroad1')
|
81
|
+
find_value_of_field(field)
|
82
|
+
end
|
83
|
+
|
84
|
+
def find_education_narrow_field
|
85
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblFieldOfEducationNarrow1')
|
86
|
+
find_value_of_field(field)
|
87
|
+
end
|
88
|
+
|
89
|
+
def find_education_detailed_field
|
90
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblFieldOfEducationDetailed1')
|
91
|
+
find_value_of_field(field)
|
92
|
+
end
|
93
|
+
|
94
|
+
def find_course_level
|
95
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblCourseLevel')
|
96
|
+
find_value_of_field(field)
|
97
|
+
end
|
98
|
+
|
99
|
+
def find_foundation_studies
|
100
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblFoundationStudies')
|
101
|
+
find_value_of_field(field)
|
102
|
+
end
|
103
|
+
|
104
|
+
def find_work_component
|
105
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblWorkComponent')
|
106
|
+
find_value_of_field(field)
|
107
|
+
end
|
108
|
+
|
109
|
+
def find_course_language
|
110
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblCourseLanguage')
|
111
|
+
find_value_of_field(field)
|
112
|
+
end
|
113
|
+
|
114
|
+
def find_duration
|
115
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblDuration')
|
116
|
+
find_value_of_field(field)
|
117
|
+
end
|
118
|
+
|
119
|
+
def find_total_cost
|
120
|
+
field = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseDetail_lblTotalCourseCost')
|
121
|
+
find_value_of_field(field)
|
122
|
+
end
|
123
|
+
|
124
|
+
def find_contact_officers
|
125
|
+
contact_officers = []
|
126
|
+
|
127
|
+
contact_officers_list = @page.search('//div[starts-with(@id, "ctl00_cphDefaultPage_tabContainer_sheetContactDetail_contactDetail_pnl")]')
|
128
|
+
|
129
|
+
contact_officers_list.each do |contact_officer|
|
130
|
+
@contact_officer_area = contact_officer
|
131
|
+
@contact_officer_table = @contact_officer_area.at('table').children
|
132
|
+
|
133
|
+
if contains_contact_details_grid?
|
134
|
+
contact_officers += find_contact_officer_grid
|
135
|
+
else
|
136
|
+
contact_officers << find_contact_officer
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
contact_officers
|
141
|
+
end
|
142
|
+
|
143
|
+
def find_contact_officer_grid
|
144
|
+
contact_officers = []
|
145
|
+
|
146
|
+
excess_row_at_the_end_table = 2
|
147
|
+
data_row_start = 3
|
148
|
+
data_row_end = @contact_officer_table.count - excess_row_at_the_end_table
|
149
|
+
|
150
|
+
for i in data_row_start..data_row_end
|
151
|
+
contact_row = @contact_officer_table[i].children
|
152
|
+
|
153
|
+
contact = ContactOfficer.new
|
154
|
+
contact.role = find_contact_officer_role
|
155
|
+
contact.name = find_value_of_field(contact_row[1])
|
156
|
+
contact.phone = find_value_of_field(contact_row[2])
|
157
|
+
contact.fax = find_value_of_field(contact_row[3])
|
158
|
+
contact.email = find_value_of_field(contact_row[4])
|
159
|
+
|
160
|
+
contact_officers << contact
|
161
|
+
end
|
162
|
+
|
163
|
+
contact_officers
|
164
|
+
end
|
165
|
+
|
166
|
+
def find_contact_officer
|
167
|
+
contact = ContactOfficer.new
|
168
|
+
contact.role = find_contact_officer_role
|
169
|
+
contact.name = find_contact_officer_name
|
170
|
+
contact.title = find_contact_officer_title
|
171
|
+
contact.phone = find_contact_officer_phone
|
172
|
+
contact.fax = find_contact_officer_fax
|
173
|
+
contact.email = find_contact_officer_email
|
174
|
+
|
175
|
+
contact
|
176
|
+
end
|
177
|
+
|
178
|
+
def find_contact_officer_role
|
179
|
+
row = @contact_officer_area.children
|
180
|
+
find_value_of_field(row[1]).sub(':', '')
|
181
|
+
end
|
182
|
+
|
183
|
+
def find_contact_officer_name
|
184
|
+
row = @contact_officer_table[1].children
|
185
|
+
find_value_of_field(row[3])
|
186
|
+
end
|
187
|
+
|
188
|
+
def find_contact_officer_title
|
189
|
+
row = @contact_officer_table[3].children
|
190
|
+
find_value_of_field(row[3])
|
191
|
+
end
|
192
|
+
|
193
|
+
def find_contact_officer_phone
|
194
|
+
row = @contact_officer_table[5].children
|
195
|
+
find_value_of_field(row[3])
|
196
|
+
end
|
197
|
+
|
198
|
+
def find_contact_officer_fax
|
199
|
+
row = @contact_officer_table[7].children
|
200
|
+
find_value_of_field(row[3])
|
201
|
+
end
|
202
|
+
|
203
|
+
def find_contact_officer_email
|
204
|
+
row = @contact_officer_table[9]
|
205
|
+
find_value_of_field(row.children[3]) unless row.nil?
|
206
|
+
end
|
207
|
+
|
208
|
+
def contains_contact_details_grid?
|
209
|
+
contact_officer_area_css_id = @contact_officer_area.attributes['id'].text
|
210
|
+
@page.search("//*[@id='#{contact_officer_area_css_id}']/div/table[starts-with(@id, 'ctl00_cphDefaultPage_tabContainer_sheetContactDetail_contactDetail_grid')]").any?
|
211
|
+
end
|
212
|
+
|
213
|
+
#Get all locations of course
|
214
|
+
def find_course_location
|
215
|
+
location_ids = []
|
216
|
+
|
217
|
+
if location_results_paginated?
|
218
|
+
for page_number in 1..total_pages
|
219
|
+
jump_to_page(page_number)
|
220
|
+
location_ids += fetch_location_ids_from_current_page
|
221
|
+
end
|
222
|
+
else
|
223
|
+
location_ids += fetch_location_ids_from_current_page
|
224
|
+
end
|
225
|
+
|
226
|
+
location_ids
|
227
|
+
end
|
228
|
+
|
229
|
+
def pagination
|
230
|
+
@page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseLocationList_gridSearchResults .gridPager')
|
231
|
+
end
|
232
|
+
|
233
|
+
def location_results_paginated?
|
234
|
+
!!pagination
|
235
|
+
end
|
236
|
+
|
237
|
+
def total_pages
|
238
|
+
pagination.children[1].text.strip[/^Page [0-9]+ of ([0-9]+).*/, 1].to_i
|
239
|
+
end
|
240
|
+
|
241
|
+
def current_pagination_page
|
242
|
+
pagination.children[1].text.strip[/^Page ([0-9]+) of [0-9]+.*/, 1].to_i
|
243
|
+
end
|
244
|
+
|
245
|
+
def jump_to_page(page_number)
|
246
|
+
return @page if page_number == current_pagination_page
|
247
|
+
|
248
|
+
hidden_form = @page.form_with :id => "aspnetForm"
|
249
|
+
hidden_form['__EVENTTARGET'] = 'ctl00$cphDefaultPage$tabContainer$sheetCourseDetail$courseLocationList$gridSearchResults'
|
250
|
+
hidden_form['__EVENTARGUMENT'] = "Page$#{page_number}"
|
251
|
+
begin
|
252
|
+
@page = hidden_form.submit(nil, {'action' => 'change-page'})
|
253
|
+
rescue Mechanize::ResponseCodeError
|
254
|
+
sleep 5
|
255
|
+
scrape_course(course_id)
|
256
|
+
end
|
257
|
+
end
|
258
|
+
|
259
|
+
def get_location_id(row_index)
|
260
|
+
hidden_form = @page.form_with :id => "aspnetForm"
|
261
|
+
hidden_form['__EVENTTARGET'] = 'ctl00$cphDefaultPage$tabContainer$sheetCourseDetail$courseLocationList$gridSearchResults'
|
262
|
+
hidden_form['__EVENTARGUMENT'] = "click-#{row_index-3}"
|
263
|
+
begin
|
264
|
+
course_page = hidden_form.submit(nil, {'action' => 'get-location-id'})
|
265
|
+
rescue Mechanize::ResponseCodeError
|
266
|
+
sleep 5
|
267
|
+
scrape_course(course_id)
|
268
|
+
end
|
269
|
+
|
270
|
+
course_page.uri.to_s[/LocationID=([0-9]+)/, 1]
|
271
|
+
end
|
272
|
+
|
273
|
+
def fetch_location_ids_from_current_page
|
274
|
+
location_ids = []
|
275
|
+
|
276
|
+
# location_list is table contains locations in current page
|
277
|
+
location_list = @page.at('#ctl00_cphDefaultPage_tabContainer_sheetCourseDetail_courseLocationList_gridSearchResults').children
|
278
|
+
|
279
|
+
excess_row_at_the_end_table = location_results_paginated? ? 3 : 2
|
280
|
+
start_location_row = 3
|
281
|
+
end_location_row = location_list.count - excess_row_at_the_end_table
|
282
|
+
|
283
|
+
for i in start_location_row..end_location_row
|
284
|
+
location_ids << get_location_id(i)
|
285
|
+
end
|
286
|
+
|
287
|
+
location_ids
|
288
|
+
end
|
289
|
+
|
290
|
+
end
|
291
|
+
end
|