cricos_scrape 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +7 -0
  2. data/CONTRIBUTING.md +51 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +64 -0
  5. data/LICENSE.md +22 -0
  6. data/Procfile +3 -0
  7. data/README.md +40 -0
  8. data/Rakefile +13 -0
  9. data/cricos_scrape.gemspec +31 -0
  10. data/lib/cricos_scrape/agent.rb +9 -0
  11. data/lib/cricos_scrape/bulk_import_courses.rb +31 -0
  12. data/lib/cricos_scrape/bulk_import_institutions.rb +31 -0
  13. data/lib/cricos_scrape/import_contacts.rb +22 -0
  14. data/lib/cricos_scrape/json_struct.rb +11 -0
  15. data/lib/cricos_scrape/version.rb +3 -0
  16. data/lib/cricos_scrape.rb +8 -0
  17. data/spec/contact_importer_spec.rb +76 -0
  18. data/spec/course_importer_spec.rb +71 -0
  19. data/spec/fixtures/contact_details_of_state_act_uri.html +546 -0
  20. data/spec/fixtures/contact_details_of_state_wa_uri.html +546 -0
  21. data/spec/fixtures/course_details_with_contact_officers_table_grid.html +467 -0
  22. data/spec/fixtures/course_details_without_pagination_uri.html +470 -0
  23. data/spec/fixtures/courses_list_by_location_id_uri.html +174 -0
  24. data/spec/fixtures/institution_details_with_pagination_location_page_1_uri.html +406 -0
  25. data/spec/fixtures/institution_details_with_pagination_location_page_2_uri.html +358 -0
  26. data/spec/fixtures/institution_details_with_po_box_postal_address.html +240 -0
  27. data/spec/fixtures/institution_details_with_trading_name.html +322 -0
  28. data/spec/fixtures/institution_details_without_locations_details_uri.html +151 -0
  29. data/spec/fixtures/institution_details_without_pagination_location_uri.html +299 -0
  30. data/spec/fixtures/not_found_course_details_uri.html +837 -0
  31. data/spec/fixtures/not_found_institution_details.html +36 -0
  32. data/spec/institution_importer_spec.rb +138 -0
  33. data/spec/spec_helper.rb +67 -0
  34. metadata +190 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2cb41efd407804298343699766a9d530e4d51caf
4
+ data.tar.gz: 8ec6ecfb82d9ce2c2171297b8ef3026008e0eaaa
5
+ SHA512:
6
+ metadata.gz: 47b66ae22e51f5846aa2aaac0bb4e93f4ceb1ce94d4ae81dab0179b0eaa95ed399d64bdf551c6a1416fac8059119fbd78292f9e963a9caf7364b5dcf5a090844
7
+ data.tar.gz: e2c8f522a4444ae73feff3d6ccc40c8c55017c1aa64ceec48cf12176c56e7876597f679cc6bcdfe8ba9a2a29182ad8746d7b2254b90321f09a975f3af4d6eecf
data/CONTRIBUTING.md ADDED
@@ -0,0 +1,51 @@
1
+ cricos_scrape is an open source project and we would love you to help us make it better.
2
+
3
+ ## Reporting Issues
4
+
5
+ A well formatted issue is appreciated, and goes a long way in helping us help you.
6
+
7
+ * Make sure you have a [GitHub account](https://github.com/signup/free)
8
+ * Submit a [Github issue](./issues) by:
9
+ * Clearly describing the issue
10
+ * Provide a descriptive summary
11
+ * Explain the expected behavior
12
+ * Explain the actual behavior
13
+ * Provide steps to reproduce the actual behavior
14
+ * Provide your application's complete `Gemfile.lock` as text (in a [Gist](https://gist.github.com) for bonus points)
15
+ * Any relevant stack traces
16
+
17
+ If you provide code, make sure it is formatted with the triple backticks (\`).
18
+
19
+ At this point, we'd love to tell you how long it will take for us to respond,
20
+ but we just don't know.
21
+
22
+ ## Pull requests
23
+
24
+ We accept pull requests to cricos_scrape for:
25
+
26
+ * Adding documentation
27
+ * Fixing bugs
28
+ * Adding new features
29
+
30
+ Not all features proposed will be added but we are open to having a conversation
31
+ about a feature you are championing.
32
+
33
+ Here's a quick guide:
34
+
35
+ 1. Fork the repo.
36
+
37
+ 2. Run the tests. This is to make sure your starting point works. Tests can be
38
+ run via `rake`
39
+
40
+ 3. Create a new branch and make your changes. This includes tests for features!
41
+
42
+ 4. Push to your fork and submit a pull request. For more information, see
43
+ [Github's pull request help section](https://help.github.com/articles/using-pull-requests/).
44
+
45
+ At this point you're waiting on us. Expect a conversation regarding your pull
46
+ request: questions, clarifications, and so on.
47
+
48
+ Some things that will increase the chance that your pull request is accepted:
49
+
50
+ * Include tests that fail without your code, and pass with it
51
+ * Update the documentation
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,64 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ cricos_scrape (2.0)
5
+ mechanize (~> 2.7, >= 2.7.2)
6
+ slop (~> 4.2.0, >= 4.2.0)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ diff-lcs (1.2.5)
12
+ domain_name (0.5.25)
13
+ unf (>= 0.0.5, < 1.0.0)
14
+ http-cookie (1.0.2)
15
+ domain_name (~> 0.5)
16
+ mechanize (2.7.3)
17
+ domain_name (~> 0.5, >= 0.5.1)
18
+ http-cookie (~> 1.0)
19
+ mime-types (~> 2.0)
20
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
21
+ net-http-persistent (~> 2.5, >= 2.5.2)
22
+ nokogiri (~> 1.4)
23
+ ntlm-http (~> 0.1, >= 0.1.1)
24
+ webrobots (>= 0.0.9, < 0.2)
25
+ mime-types (2.6.2)
26
+ mini_portile (0.6.2)
27
+ net-http-digest_auth (1.4)
28
+ net-http-persistent (2.9.4)
29
+ nokogiri (1.6.6.2)
30
+ mini_portile (~> 0.6.0)
31
+ ntlm-http (0.1.1)
32
+ rspec (3.3.0)
33
+ rspec-core (~> 3.3.0)
34
+ rspec-expectations (~> 3.3.0)
35
+ rspec-mocks (~> 3.3.0)
36
+ rspec-core (3.3.2)
37
+ rspec-support (~> 3.3.0)
38
+ rspec-expectations (3.3.1)
39
+ diff-lcs (>= 1.2.0, < 2.0)
40
+ rspec-support (~> 3.3.0)
41
+ rspec-its (1.2.0)
42
+ rspec-core (>= 3.0.0)
43
+ rspec-expectations (>= 3.0.0)
44
+ rspec-mocks (3.3.2)
45
+ diff-lcs (>= 1.2.0, < 2.0)
46
+ rspec-support (~> 3.3.0)
47
+ rspec-support (3.3.0)
48
+ slop (4.2.0)
49
+ unf (0.1.4)
50
+ unf_ext
51
+ unf_ext (0.0.7.1)
52
+ webrobots (0.1.1)
53
+
54
+ PLATFORMS
55
+ ruby
56
+
57
+ DEPENDENCIES
58
+ bundler (~> 1.6)
59
+ cricos_scrape!
60
+ rspec (~> 3.3.0, >= 3.3.0)
61
+ rspec-its (~> 1.2.0, >= 1.2.0)
62
+
63
+ BUNDLED WITH
64
+ 1.10.6
data/LICENSE.md ADDED
@@ -0,0 +1,22 @@
1
+ Copyright © 2014-2015 Trung Lê
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Procfile ADDED
@@ -0,0 +1,3 @@
1
+ job_1: bundle exec rake import:institutions MIN_ID=1 MAX_ID=10000
2
+ job_2: bundle exec rake import:courses
3
+ job_3: bundle exec rake import:contacts OVERWRITE=true
data/README.md ADDED
@@ -0,0 +1,40 @@
1
+ [![Build Status](https://travis-ci.org/ruby-journal/cricos_scrape.rb.svg)](https://travis-ci.org/ruby-journal/cricos_scrape.rb)
2
+
3
+
4
+ # CRICOS Scraper
5
+
6
+ ![CRICOS Logo](http://cricos.education.gov.au/images/cricos.gif)
7
+
8
+ CRICOS lacks an API for data retrieval (as do many government-based services). This gem
9
+ helps scrape data from [http://cricos.education.gov.au](http://cricos.education.gov.au).
10
+
11
+ This gem supports Ruby 2.2.3+ only.
12
+
13
+ # Features
14
+
15
+ Supports scraping the following entities:
16
+
17
+ * Institution
18
+ * Course
19
+ * Contact
20
+
21
+ # Installation
22
+
23
+ ```
24
+ gem install cricos_scrape
25
+ ```
26
+
27
+ # Usage
28
+
29
+ Please consult `cricos_scrape -h` command line.
30
+
31
+ # Testing
32
+
33
+ The tests are in the `spec` directory. Run them with:
34
+ ```
35
+ rspec
36
+ ```
37
+
38
+ # License
39
+
40
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,13 @@
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new

require_relative 'lib/cricos_scrape'

# Scraping tasks. The importers print each record as JSON on standard output
# and report misses on standard error, so redirect the output to keep it.
namespace :import do
  # The Procfile runs `rake import:institutions MIN_ID=... MAX_ID=...`.
  # When a bound is not given, the importer's own defaults (0 and 10000) apply.
  desc 'Scrape institutions whose provider ID lies in MIN_ID..MAX_ID'
  task :institutions do
    min_id = Integer(ENV.fetch('MIN_ID', 0))
    max_id = Integer(ENV.fetch('MAX_ID', 10000))
    CricosScrape::BulkImportInstitutions.new(min_id, max_id).perform
  end

  # The Procfile runs `rake import:courses`; the same optional bounds apply.
  desc 'Scrape courses whose course ID lies in MIN_ID..MAX_ID'
  task :courses do
    min_id = Integer(ENV.fetch('MIN_ID', 0))
    max_id = Integer(ENV.fetch('MAX_ID', 10000))
    CricosScrape::BulkImportCourses.new(min_id, max_id).perform
  end

  # ImportContacts takes no arguments, so OUTPUT_FILE and OVERWRITE are not
  # consulted here. The previous task body referenced a BulkImportContacts
  # class that is not defined by lib/cricos_scrape.
  desc 'Scrape the contact list'
  task :contacts do
    CricosScrape::ImportContacts.new.perform
  end
end
@@ -0,0 +1,31 @@
# Resolve the version file relative to this gemspec rather than the current
# working directory, so the spec loads no matter where it is evaluated from.
require_relative 'lib/cricos_scrape/version'

Gem::Specification.new do |spec|
  spec.name          = 'cricos_scrape'
  # Coerce explicitly so the spec is valid whether VERSION is a String or a Float.
  spec.version       = CricosScrape::VERSION.to_s
  spec.authors       = ['Trung Lê', 'Toàn Lê']
  spec.email         = ['trung.le@ruby-journal.com', 'ktoanlba@gmail.com']
  spec.summary       = %q{CRICOS Scrape}
  spec.description   = %q{Scrape Institutions, Courses, Contacts from CRICOS}
  spec.homepage      = 'https://github.com/ruby-journal/cricos_scrape.rb'
  spec.license       = 'MIT'

  # 'bin/*' must be listed or the executables grep below can never match.
  # 'lib/cricos_scrape/**/*.rb' also picks up nested directories such as
  # lib/cricos_scrape/importer, which the bulk importers load via require_relative.
  spec.files         = Dir['[A-Z]*',
                           'bin/*',
                           'lib/*.rb',
                           'lib/cricos_scrape/**/*.rb',
                           'spec/*.rb',
                           'spec/fixtures/*.html']

  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(spec)/})
  spec.required_ruby_version = '>= 2.2.2'

  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.6'
  spec.add_development_dependency 'rspec', '~> 3.3.0', '>= 3.3.0'
  spec.add_development_dependency 'rspec-its', '~> 1.2.0', '>= 1.2.0'

  spec.add_runtime_dependency 'mechanize', '~> 2.7', '>= 2.7.2'
  spec.add_runtime_dependency 'slop', '~> 4.2.0', '>= 4.2.0'
end
@@ -0,0 +1,9 @@
1
+ require 'mechanize'
2
+
module CricosScrape
  # Builds a new Mechanize client on every call. Its User-Agent header is set
  # to the string Mechanize registers under the 'Windows IE 6' alias.
  #
  # @return [Mechanize] the configured client
  def self.agent
    Mechanize.new.tap do |browser|
      browser.user_agent = Mechanize::AGENT_ALIASES['Windows IE 6']
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require_relative './importer/course_importer'
2
+ require_relative './agent'
3
+
module CricosScrape
  # Scrapes every course whose ID lies in an inclusive range. Each course found
  # is printed as JSON on standard output; each miss is reported on standard error.
  class BulkImportCourses
    # @param min_id [Integer] first course ID to try (inclusive)
    # @param max_id [Integer] last course ID to try (inclusive)
    def initialize(min_id=0, max_id=10000)
      # Store the bounds so the private readers below return real values.
      @min_id = min_id
      @max_id = max_id
      @agent = CricosScrape.agent
    end

    # Visits each ID in ascending order.
    def perform
      # Iterate the range lazily; there is no need to build the ID array first.
      (min_id..max_id).each { |course_id| scrape(course_id) }
    end

    private

    attr_reader :min_id, :max_id, :agent

    # Runs the importer for one course ID and prints the outcome.
    def scrape(course_id)
      course = CourseImporter.new(agent, course_id: course_id).run

      if course
        puts course.to_json
      else
        $stderr.puts "Could not find course with Course ID #{course_id}"
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require_relative './importer/institution_importer'
2
+ require_relative './agent'
3
+
module CricosScrape
  # Scrapes every institution whose provider ID lies in an inclusive range.
  # Each institution found is printed as JSON on standard output; each miss is
  # reported on standard error.
  class BulkImportInstitutions
    # @param min_id [Integer] first provider ID to try (inclusive)
    # @param max_id [Integer] last provider ID to try (inclusive)
    def initialize(min_id=0, max_id=10000)
      # Store the bounds so the private readers below return real values.
      @min_id = min_id
      @max_id = max_id
      @agent = CricosScrape.agent
    end

    # Visits each ID in ascending order.
    def perform
      # Iterate the range lazily; there is no need to build the ID array first.
      (min_id..max_id).each { |provider_id| scrape(provider_id) }
    end

    private

    attr_reader :min_id, :max_id, :agent

    # Runs the importer for one provider ID and prints the outcome.
    def scrape(provider_id)
      institution = InstitutionImporter.new(agent, provider_id: provider_id).run

      if institution
        puts institution.to_json
      else
        $stderr.puts "Could not find institution with Provider ID #{provider_id}"
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ require_relative './importer/contact_importer'
2
+ require_relative './agent'
3
+
module CricosScrape
  # Fetches the contact list through ContactImporter and prints each contact
  # as JSON on standard output.
  class ImportContacts
    def initialize
      @agent = CricosScrape.agent
    end

    # Prints every contact found, or a warning on STDERR when the importer
    # returns an empty collection.
    def perform
      contacts = ContactImporter.new(@agent).run

      unless contacts.any?
        STDERR.puts "Something not right, there is no Contacts returned"
        return
      end

      contacts.each { |contact| puts contact.to_json }
    end
  end
end
@@ -0,0 +1,11 @@
# Core extension: lets any Struct serialise as a plain JSON object keyed by
# its member names.
class Struct
  # @return [Hash{Symbol=>Object}] each member name mapped to its current value
  def to_map
    # Struct#to_h already builds exactly this hash.
    to_h
  end

  # Serialises the struct as the JSON object form of #to_map.
  # Arguments are forwarded unchanged to Hash#to_json.
  def to_json(*a)
    to_map.to_json(*a)
  end
end
@@ -0,0 +1,3 @@
module CricosScrape
  # Gem version. Kept as a String because a Float cannot represent versions
  # such as "2.10" (it collapses to 2.1) or "2.0.1".
  VERSION = '2.0'.freeze
end
@@ -0,0 +1,8 @@
1
+ require 'rubygems'
2
+ require 'json'
3
+ require 'json/add/core'
4
+
5
+ require_relative 'cricos_scrape/json_struct'
6
+ require_relative 'cricos_scrape/bulk_import_institutions'
7
+ require_relative 'cricos_scrape/bulk_import_courses'
8
+ require_relative 'cricos_scrape/import_contacts'
@@ -0,0 +1,76 @@
1
+ require 'spec_helper'
2
+
describe CricosScrape::ContactImporter do
  describe '#run' do
    let(:agent) { CricosScrape.agent }
    let(:importer) { CricosScrape::ContactImporter.new(agent) }

    # School-course contact expected from the ACT fixture.
    let(:act_school_contact) do
      CricosScrape::Contact.new(
        'School Courses (and ELICOS and Foundation Programs where delivered by a school)',
        'Ms Rebecca Hughes',
        'ACT Education and Training Directorate',
        CricosScrape::Address.new('GPO Box 158', nil, 'CANBERRA', 'ACT', '2601'),
        '0262059299',
        '',
        'etd.contactus@act.gov.au'
      )
    end

    # School-course contact expected from the WA fixture.
    let(:wa_school_contact) do
      CricosScrape::Contact.new(
        'School Courses (and ELICOS and Foundation Programs where delivered by a school)',
        'Mr Steve Page Senior Registration and Policy Officer',
        'Department of Education Services, Non-Government & International Education Directorate',
        CricosScrape::Address.new('PO Box 1766', nil, 'OSBORNE PARK', 'WA', '6916'),
        '0894411962',
        '0894411901',
        'ngs@des.wa.gov.au'
      )
    end

    # Vocational-course contact; expected identically from both fixtures.
    let(:vocational_contact) do
      CricosScrape::Contact.new(
        'Vocational Courses (and ELICOS courses offered by an RTO or remaining ‘stand-alone’ ELICOS provider)',
        'ASQA Info Line',
        'Australian Skills Quality Authority',
        CricosScrape::Address.new('PO Box 9928', nil, 'Melbourne', 'VIC', '3001'),
        '1300701801',
        '',
        'enquiries@asqa.gov.au'
      )
    end

    # Higher-education contact; expected identically from both fixtures.
    let(:higher_education_contact) do
      CricosScrape::Contact.new(
        'Higher Education Courses (and ELICOS and Foundation Programs where delivered in a pathway arrangement with a Higher Education Provider)',
        'Tertiary Education Quality and Standards Agency',
        'Tertiary Education Quality and Standards Agency',
        CricosScrape::Address.new('GPO Box 1672', nil, 'Melbourne', 'VIC', '3001'),
        '1300739585',
        '1300739586',
        'enquiries@teqsa.gov.au'
      )
    end

    before do
      # Limit the importer to the two states that have fixtures.
      stub_const('CricosScrape::ContactImporter::STATES_CODE', ['ACT', 'WA'])

      fixture_uris = {
        'ACT' => contact_details_of_state_act_uri,
        'WA' => contact_details_of_state_wa_uri
      }
      fixture_uris.each do |state, fixture_uri|
        allow(importer).to receive(:url_for).with(state).and_return(fixture_uri)
      end

      @contacts = importer.run
    end

    context 'when the response body contains with states ACT and WA' do
      it 'returns array contacts array' do
        # ACT contacts first, then WA contacts, each in the expected order.
        expected = [
          act_school_contact,
          vocational_contact,
          higher_education_contact,
          vocational_contact,
          wa_school_contact,
          higher_education_contact
        ]

        expect(@contacts).to eq expected
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
+ require 'spec_helper'
2
+
describe CricosScrape::CourseImporter do
  describe '#run' do
    let(:agent) { CricosScrape.agent }
    subject(:course) { CricosScrape::CourseImporter.new(agent, course_id: 1).run }

    before do
      # Point every importer at the fixture chosen by the enclosing context.
      allow_any_instance_of(CricosScrape::CourseImporter).to receive(:url).and_return(uri)

      # Successive form submissions return the page for location 123, then 456.
      location_pages = ['123', '456'].map do |location_id|
        agent.get("#{uri}?LocationID=#{location_id}")
      end
      allow_any_instance_of(Mechanize::Form)
        .to receive(:submit)
        .with(nil, {'action' => 'get-location-id'})
        .and_return(*location_pages)
    end

    context 'when there is no course found' do
      let(:uri) { not_found_course_details_uri }

      it 'does not import' do
        expect(course).to be_nil
      end
    end

    context 'when the details course is found' do
      let(:uri) { course_details_without_pagination_uri }

      # One `its` example per scalar attribute, in the original order.
      {
        course_id: 1,
        course_name: 'Primary Yrs K-6',
        course_code: '012395K',
        dual_qualification: 'No',
        field_of_education: '',
        broad_field: '12 - Mixed Field Programmes',
        narrow_field: '1201 - General Education Programmes',
        detailed_field: '120101 - General Primary and Secondary Education Programmes',
        course_level: 'Primary School Studies',
        foundation_studies: 'No',
        work_component: 'No',
        course_language: 'English',
        duration: '364',
        total_cost: '66,500'
      }.each do |attribute, expected_value|
        its(attribute) { is_expected.to eq expected_value }
      end

      its(:contact_officers) do
        is_expected.to eq [
          CricosScrape::ContactOfficer.new('Principal Executive Officer', 'Nicole King', 'Manager', '0262056998', '62059239', nil),
          CricosScrape::ContactOfficer.new('International Student Contact', 'PAUL Wang', 'Study Tour Coordinator', '62077293', '', 'paul.wang@act.gov.au')
        ]
      end
    end

    context 'when the response body not contains pagination location' do
      let(:uri) { course_details_without_pagination_uri }

      its(:location_ids) { is_expected.to eq ["123", "456"] }
    end

    context 'when the contact officers contains table grid' do
      let(:uri) { course_details_with_contact_officers_table_grid }
      let(:data) do
        [
          CricosScrape::ContactOfficer.new('Principal Executive Officer', 'Andrew Vann', 'Vice-Chancellor', '02 6338 4209', '02 6338 4809', nil),
          CricosScrape::ContactOfficer.new('International Student Contact', 'Matthew Evans', nil, '02 63657537', '02 63657590', 'mevans@csu.edu.au'),
          CricosScrape::ContactOfficer.new('International Student Contact', 'Matthew Evans', nil, '02 6365 7537', '02 6365 7590', 'mevans@csu.edu.au')
        ]
      end

      its(:contact_officers) { is_expected.to eq data }
    end
  end
end