cricos_scrape 2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. checksums.yaml +7 -0
  2. data/CONTRIBUTING.md +51 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +64 -0
  5. data/LICENSE.md +22 -0
  6. data/Procfile +3 -0
  7. data/README.md +40 -0
  8. data/Rakefile +13 -0
  9. data/cricos_scrape.gemspec +31 -0
  10. data/lib/cricos_scrape/agent.rb +9 -0
  11. data/lib/cricos_scrape/bulk_import_courses.rb +31 -0
  12. data/lib/cricos_scrape/bulk_import_institutions.rb +31 -0
  13. data/lib/cricos_scrape/import_contacts.rb +22 -0
  14. data/lib/cricos_scrape/json_struct.rb +11 -0
  15. data/lib/cricos_scrape/version.rb +3 -0
  16. data/lib/cricos_scrape.rb +8 -0
  17. data/spec/contact_importer_spec.rb +76 -0
  18. data/spec/course_importer_spec.rb +71 -0
  19. data/spec/fixtures/contact_details_of_state_act_uri.html +546 -0
  20. data/spec/fixtures/contact_details_of_state_wa_uri.html +546 -0
  21. data/spec/fixtures/course_details_with_contact_officers_table_grid.html +467 -0
  22. data/spec/fixtures/course_details_without_pagination_uri.html +470 -0
  23. data/spec/fixtures/courses_list_by_location_id_uri.html +174 -0
  24. data/spec/fixtures/institution_details_with_pagination_location_page_1_uri.html +406 -0
  25. data/spec/fixtures/institution_details_with_pagination_location_page_2_uri.html +358 -0
  26. data/spec/fixtures/institution_details_with_po_box_postal_address.html +240 -0
  27. data/spec/fixtures/institution_details_with_trading_name.html +322 -0
  28. data/spec/fixtures/institution_details_without_locations_details_uri.html +151 -0
  29. data/spec/fixtures/institution_details_without_pagination_location_uri.html +299 -0
  30. data/spec/fixtures/not_found_course_details_uri.html +837 -0
  31. data/spec/fixtures/not_found_institution_details.html +36 -0
  32. data/spec/institution_importer_spec.rb +138 -0
  33. data/spec/spec_helper.rb +67 -0
  34. metadata +190 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2cb41efd407804298343699766a9d530e4d51caf
4
+ data.tar.gz: 8ec6ecfb82d9ce2c2171297b8ef3026008e0eaaa
5
+ SHA512:
6
+ metadata.gz: 47b66ae22e51f5846aa2aaac0bb4e93f4ceb1ce94d4ae81dab0179b0eaa95ed399d64bdf551c6a1416fac8059119fbd78292f9e963a9caf7364b5dcf5a090844
7
+ data.tar.gz: e2c8f522a4444ae73feff3d6ccc40c8c55017c1aa64ceec48cf12176c56e7876597f679cc6bcdfe8ba9a2a29182ad8746d7b2254b90321f09a975f3af4d6eecf
data/CONTRIBUTING.md ADDED
@@ -0,0 +1,51 @@
1
+ cricos_scrape is an open source project and we would love you to help us make it better.
2
+
3
+ ## Reporting Issues
4
+
5
+ A well formatted issue is appreciated, and goes a long way in helping us help you.
6
+
7
+ * Make sure you have a [GitHub account](https://github.com/signup/free)
8
+ * Submit a [GitHub issue](./issues) by:
9
+ * Clearly describing the issue
10
+ * Provide a descriptive summary
11
+ * Explain the expected behavior
12
+ * Explain the actual behavior
13
+ * Provide steps to reproduce the actual behavior
14
+ * Provide your application's complete `Gemfile.lock` as text (in a [Gist](https://gist.github.com) for bonus points)
15
+ * Any relevant stack traces
16
+
17
+ If you provide code, make sure it is formatted with triple backticks (\`\`\`).
18
+
19
+ At this point, we'd love to tell you how long it will take for us to respond,
20
+ but we just don't know.
21
+
22
+ ## Pull requests
23
+
24
+ We accept pull requests to cricos_scrape for:
25
+
26
+ * Adding documentation
27
+ * Fixing bugs
28
+ * Adding new features
29
+
30
+ Not all features proposed will be added but we are open to having a conversation
31
+ about a feature you are championing.
32
+
33
+ Here's a quick guide:
34
+
35
+ 1. Fork the repo.
36
+
37
+ 2. Run the tests. This is to make sure your starting point works. Tests can be
38
+ run via `rake`
39
+
40
+ 3. Create a new branch and make your changes. This includes tests for features!
41
+
42
+ 4. Push to your fork and submit a pull request. For more information, see
43
+ [Github's pull request help section](https://help.github.com/articles/using-pull-requests/).
44
+
45
+ At this point you're waiting on us. Expect a conversation regarding your pull
46
+ request: questions, clarifications, and so on.
47
+
48
+ Some things that will increase the chance that your pull request is accepted:
49
+
50
+ * Include tests that fail without your code, and pass with it
51
+ * Update the documentation
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,64 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ cricos_scrape (2.0)
5
+ mechanize (~> 2.7, >= 2.7.2)
6
+ slop (~> 4.2.0, >= 4.2.0)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ diff-lcs (1.2.5)
12
+ domain_name (0.5.25)
13
+ unf (>= 0.0.5, < 1.0.0)
14
+ http-cookie (1.0.2)
15
+ domain_name (~> 0.5)
16
+ mechanize (2.7.3)
17
+ domain_name (~> 0.5, >= 0.5.1)
18
+ http-cookie (~> 1.0)
19
+ mime-types (~> 2.0)
20
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
21
+ net-http-persistent (~> 2.5, >= 2.5.2)
22
+ nokogiri (~> 1.4)
23
+ ntlm-http (~> 0.1, >= 0.1.1)
24
+ webrobots (>= 0.0.9, < 0.2)
25
+ mime-types (2.6.2)
26
+ mini_portile (0.6.2)
27
+ net-http-digest_auth (1.4)
28
+ net-http-persistent (2.9.4)
29
+ nokogiri (1.6.6.2)
30
+ mini_portile (~> 0.6.0)
31
+ ntlm-http (0.1.1)
32
+ rspec (3.3.0)
33
+ rspec-core (~> 3.3.0)
34
+ rspec-expectations (~> 3.3.0)
35
+ rspec-mocks (~> 3.3.0)
36
+ rspec-core (3.3.2)
37
+ rspec-support (~> 3.3.0)
38
+ rspec-expectations (3.3.1)
39
+ diff-lcs (>= 1.2.0, < 2.0)
40
+ rspec-support (~> 3.3.0)
41
+ rspec-its (1.2.0)
42
+ rspec-core (>= 3.0.0)
43
+ rspec-expectations (>= 3.0.0)
44
+ rspec-mocks (3.3.2)
45
+ diff-lcs (>= 1.2.0, < 2.0)
46
+ rspec-support (~> 3.3.0)
47
+ rspec-support (3.3.0)
48
+ slop (4.2.0)
49
+ unf (0.1.4)
50
+ unf_ext
51
+ unf_ext (0.0.7.1)
52
+ webrobots (0.1.1)
53
+
54
+ PLATFORMS
55
+ ruby
56
+
57
+ DEPENDENCIES
58
+ bundler (~> 1.6)
59
+ cricos_scrape!
60
+ rspec (~> 3.3.0, >= 3.3.0)
61
+ rspec-its (~> 1.2.0, >= 1.2.0)
62
+
63
+ BUNDLED WITH
64
+ 1.10.6
data/LICENSE.md ADDED
@@ -0,0 +1,22 @@
1
+ Copyright © 2014-2015 Trung Lê
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Procfile ADDED
@@ -0,0 +1,3 @@
1
+ job_1: bundle exec rake import:institutions MIN_ID=1 MAX_ID=10000
2
+ job_2: bundle exec rake import:courses
3
+ job_3: bundle exec rake import:contacts OVERWRITE=true
data/README.md ADDED
@@ -0,0 +1,40 @@
1
+ [![Build Status](https://travis-ci.org/ruby-journal/cricos_scrape.rb.svg)](https://travis-ci.org/ruby-journal/cricos_scrape.rb)
2
+
3
+
4
+ # CRICOS Scraper
5
+
6
+ ![CRICOS Logo](http://cricos.education.gov.au/images/cricos.gif)
7
+
8
+ CRICOS lacks an API for data retrieval (as do many government-based services). This gem
9
+ helps scrape data from [http://cricos.education.gov.au](http://cricos.education.gov.au).
10
+
11
+ This gem requires Ruby 2.2.2 or later.
12
+
13
+ # Features
14
+
15
+ Supports scraping the following entities:
16
+
17
+ * Institution
18
+ * Course
19
+ * Contact
20
+
21
+ # Installation
22
+
23
+ ```
24
+ gem install cricos_scrape
25
+ ```
26
+
27
+ # Usage
28
+
29
+ Please consult `cricos_scrape -h` command line.
30
+
31
+ # Testing
32
+
33
+ The tests are in the `spec` directory. Run them with:
34
+ ```
35
+ rspec
36
+ ```
37
+
38
+ # License
39
+
40
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ require 'rspec/core/rake_task'
2
+ RSpec::Core::RakeTask.new
3
+
4
+ require_relative 'lib/cricos_scrape'
5
# Rake entry points for the CRICOS importers. Every task prints one JSON
# document per scraped record to STDOUT and reports misses on STDERR.
namespace :import do
  # Scrape institutions for every provider ID in MIN_ID..MAX_ID.
  # The defaults mirror BulkImportInstitutions' own defaults (0 and 10000).
  task :institutions do
    min_id = Integer(ENV.fetch('MIN_ID', 0))
    max_id = Integer(ENV.fetch('MAX_ID', 10000))
    CricosScrape::BulkImportInstitutions.new(min_id, max_id).perform
  end

  # Scrape courses for every course ID in MIN_ID..MAX_ID.
  # The defaults mirror BulkImportCourses' own defaults (0 and 10000).
  task :courses do
    min_id = Integer(ENV.fetch('MIN_ID', 0))
    max_id = Integer(ENV.fetch('MAX_ID', 10000))
    CricosScrape::BulkImportCourses.new(min_id, max_id).perform
  end

  # Scrape the contact list.
  # ImportContacts takes no constructor arguments and writes to STDOUT, so the
  # OUTPUT_FILE / OVERWRITE environment variables are not consulted here;
  # redirect the task's output to persist it.
  task :contacts do
    CricosScrape::ImportContacts.new.perform
  end
end
@@ -0,0 +1,31 @@
1
+ require './lib/cricos_scrape/version'
2
+
3
# Packaging definition for the cricos_scrape gem.
Gem::Specification.new do |spec|
  spec.name          = 'cricos_scrape'
  spec.version       = CricosScrape::VERSION
  spec.authors       = ['Trung Lê', 'Toàn Lê']
  spec.email         = ['trung.le@ruby-journal.com', 'ktoanlba@gmail.com']
  spec.summary       = %q{CRICOS Scrape}
  spec.description   = %q{Scrape Institutions, Courses, Contacts from CRICOS}
  spec.homepage      = 'https://github.com/ruby-journal/cricos_scrape.rb'
  spec.license       = 'MIT'

  # lib/ is globbed recursively because the library requires files from
  # lib/cricos_scrape/importer/, which one-level patterns would leave out of
  # the packaged gem. bin/* is listed so the executables grep below has
  # something to match when a bin/ directory exists.
  spec.files         = Dir['[A-Z]*',
                           'bin/*',
                           'lib/**/*.rb',
                           'spec/*.rb',
                           'spec/fixtures/*.html']

  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(spec)/})
  spec.required_ruby_version = '>= 2.2.2'

  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.6'
  spec.add_development_dependency 'rspec', '~> 3.3.0', '>= 3.3.0'
  spec.add_development_dependency 'rspec-its', '~> 1.2.0', '>= 1.2.0'

  spec.add_runtime_dependency 'mechanize', '~> 2.7', '>= 2.7.2'
  spec.add_runtime_dependency 'slop', '~> 4.2.0', '>= 4.2.0'
end
@@ -0,0 +1,9 @@
1
+ require 'mechanize'
2
+
3
module CricosScrape
  # Builds a new Mechanize client on every call, with its user agent set to
  # Mechanize's 'Windows IE 6' alias.
  #
  # @return [Mechanize] the configured client
  def self.agent
    Mechanize.new.tap do |browser|
      browser.user_agent = Mechanize::AGENT_ALIASES['Windows IE 6']
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require_relative './importer/course_importer'
2
+ require_relative './agent'
3
+
4
module CricosScrape
  # Scrapes every course whose ID lies in an inclusive ID range, printing each
  # course found as JSON on STDOUT and reporting each miss on STDERR.
  class BulkImportCourses
    # @param min_id first course ID to try (inclusive)
    # @param max_id last course ID to try (inclusive)
    def initialize(min_id=0, max_id=10000)
      @min_id = min_id
      @max_id = max_id
      # Kept as a Range: the IDs are only walked once, so there is no need to
      # materialise them into an array.
      @range = min_id..max_id
      @agent = CricosScrape.agent
    end

    # Attempts to scrape each course ID in the range, in ascending order.
    def perform
      @range.each { |course_id| scrape(course_id) }
    end

    private

    # These readers are backed by the instance variables assigned in #initialize.
    attr_reader :min_id, :max_id, :agent

    # Runs a CourseImporter for one ID; a falsy result is treated as "not found".
    def scrape(course_id)
      course = CourseImporter.new(agent, course_id: course_id).run

      if course
        puts course.to_json
      else
        STDERR.puts "Could not find course with Course ID #{course_id}"
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require_relative './importer/institution_importer'
2
+ require_relative './agent'
3
+
4
module CricosScrape
  # Scrapes every institution whose provider ID lies in an inclusive ID range,
  # printing each institution found as JSON on STDOUT and reporting each miss
  # on STDERR.
  class BulkImportInstitutions
    # @param min_id first provider ID to try (inclusive)
    # @param max_id last provider ID to try (inclusive)
    def initialize(min_id=0, max_id=10000)
      @min_id = min_id
      @max_id = max_id
      # Kept as a Range: the IDs are only walked once, so there is no need to
      # materialise them into an array.
      @range = min_id..max_id
      @agent = CricosScrape.agent
    end

    # Attempts to scrape each provider ID in the range, in ascending order.
    def perform
      @range.each { |provider_id| scrape(provider_id) }
    end

    private

    # These readers are backed by the instance variables assigned in #initialize.
    attr_reader :min_id, :max_id, :agent

    # Runs an InstitutionImporter for one ID; a falsy result is treated as "not found".
    def scrape(provider_id)
      institution = InstitutionImporter.new(agent, provider_id: provider_id).run

      if institution
        puts institution.to_json
      else
        STDERR.puts "Could not find institution with Provider ID #{provider_id}"
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ require_relative './importer/contact_importer'
2
+ require_relative './agent'
3
+
4
module CricosScrape
  # Runs the contact importer once and prints every contact it returns as a
  # JSON line on STDOUT.
  class ImportContacts
    def initialize
      @agent = CricosScrape.agent
    end

    # Prints each imported contact; when the importer yields no (truthy)
    # contacts, a warning is written to STDERR instead.
    def perform
      imported = ContactImporter.new(@agent).run

      unless imported.any?
        STDERR.puts "Something not right, there is no Contacts returned"
        return
      end

      imported.each { |contact| puts contact.to_json }
    end
  end
end
@@ -0,0 +1,11 @@
1
# Reopens Struct so that struct instances serialise to JSON as plain objects
# keyed by member name.
class Struct
  # @return [Hash] each member name (as returned by #members) mapped to its value
  def to_map
    members.each_with_object({}) do |member, mapping|
      mapping[member] = self[member]
    end
  end

  # Serialises the #to_map hash; any arguments are passed through to its #to_json.
  def to_json(*args)
    to_map.to_json(*args)
  end
end
@@ -0,0 +1,3 @@
1
module CricosScrape
  # Gem version. Stored as a String because a Float literal cannot represent
  # versions such as "2.10" or "2.0.1".
  VERSION = '2.0'.freeze
end
@@ -0,0 +1,8 @@
1
+ require 'rubygems'
2
+ require 'json'
3
+ require 'json/add/core'
4
+
5
+ require_relative 'cricos_scrape/json_struct'
6
+ require_relative 'cricos_scrape/bulk_import_institutions'
7
+ require_relative 'cricos_scrape/bulk_import_courses'
8
+ require_relative 'cricos_scrape/import_contacts'
@@ -0,0 +1,76 @@
1
+ require 'spec_helper'
2
+
3
# Checks that ContactImporter#run returns the contacts of every configured
# state, in state order.
describe CricosScrape::ContactImporter do

  describe '#run' do
    let(:agent) { CricosScrape.agent }
    let(:importer) { CricosScrape::ContactImporter.new(agent) }
    subject(:contacts) { importer.run }

    # Vocational contact expected for both ACT and WA.
    let(:asqa_contact) do
      CricosScrape::Contact.new(
        'Vocational Courses (and ELICOS courses offered by an RTO or remaining ‘stand-alone’ ELICOS provider)',
        'ASQA Info Line',
        'Australian Skills Quality Authority',
        CricosScrape::Address.new('PO Box 9928', nil, 'Melbourne', 'VIC', '3001'),
        '1300701801',
        '',
        'enquiries@asqa.gov.au'
      )
    end

    # Higher-education contact expected for both ACT and WA.
    let(:teqsa_contact) do
      CricosScrape::Contact.new(
        'Higher Education Courses (and ELICOS and Foundation Programs where delivered in a pathway arrangement with a Higher Education Provider)',
        'Tertiary Education Quality and Standards Agency',
        'Tertiary Education Quality and Standards Agency',
        CricosScrape::Address.new('GPO Box 1672', nil, 'Melbourne', 'VIC', '3001'),
        '1300739585',
        '1300739586',
        'enquiries@teqsa.gov.au'
      )
    end

    let(:act_school_contact) do
      CricosScrape::Contact.new(
        'School Courses (and ELICOS and Foundation Programs where delivered by a school)',
        'Ms Rebecca Hughes',
        'ACT Education and Training Directorate',
        CricosScrape::Address.new('GPO Box 158', nil, 'CANBERRA', 'ACT', '2601'),
        '0262059299',
        '',
        'etd.contactus@act.gov.au'
      )
    end

    let(:wa_school_contact) do
      CricosScrape::Contact.new(
        'School Courses (and ELICOS and Foundation Programs where delivered by a school)',
        'Mr Steve Page Senior Registration and Policy Officer',
        'Department of Education Services, Non-Government & International Education Directorate',
        CricosScrape::Address.new('PO Box 1766', nil, 'OSBORNE PARK', 'WA', '6916'),
        '0894411962',
        '0894411901',
        'ngs@des.wa.gov.au'
      )
    end

    before do
      # Limit the run to two states and point each at its own URI
      # (the *_uri helpers are presumably fixture locations from spec_helper).
      stub_const('CricosScrape::ContactImporter::STATES_CODE', ['ACT', 'WA'])

      state_uris = {
        'ACT' => contact_details_of_state_act_uri,
        'WA' => contact_details_of_state_wa_uri
      }
      state_uris.each do |state, state_uri|
        allow(importer).to receive(:url_for).with(state).and_return(state_uri)
      end
    end

    context 'when the response body contains with states ACT and WA' do
      it 'returns array contacts array' do
        expected_contacts = [
          # contacts of ACT
          act_school_contact,
          asqa_contact,
          teqsa_contact,
          # contacts of WA
          asqa_contact,
          wa_school_contact,
          teqsa_contact
        ]

        expect(contacts).to eq expected_contacts
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
+ require 'spec_helper'
2
+
3
# Checks what CourseImporter#run extracts from the course-details pages.
describe CricosScrape::CourseImporter do

  describe '#run' do
    let(:agent) { CricosScrape.agent }
    subject(:course) { CricosScrape::CourseImporter.new(agent, course_id: 1).run }

    before do
      allow_any_instance_of(CricosScrape::CourseImporter).to receive(:url).and_return(uri)

      # Successive form submissions hand back these two pages, in this order.
      location_pages = %w[123 456].map { |location_id| agent.get("#{uri}?LocationID=#{location_id}") }
      allow_any_instance_of(Mechanize::Form)
        .to receive(:submit)
        .with(nil, {'action' => 'get-location-id'})
        .and_return(*location_pages)
    end

    context 'when there is no course found' do
      let(:uri) { not_found_course_details_uri }

      it 'does not import' do
        expect(course).to be_nil
      end
    end

    context 'when the details course is found' do
      let(:uri) { course_details_without_pagination_uri }

      let(:expected_officers) do
        [
          CricosScrape::ContactOfficer.new('Principal Executive Officer', 'Nicole King', 'Manager', '0262056998', '62059239', nil),
          CricosScrape::ContactOfficer.new('International Student Contact', 'PAUL Wang', 'Study Tour Coordinator', '62077293', '', 'paul.wang@act.gov.au'),
        ]
      end

      # One `its` example per scalar attribute, in the order listed here.
      {
        course_id: 1,
        course_name: 'Primary Yrs K-6',
        course_code: '012395K',
        dual_qualification: 'No',
        field_of_education: '',
        broad_field: '12 - Mixed Field Programmes',
        narrow_field: '1201 - General Education Programmes',
        detailed_field: '120101 - General Primary and Secondary Education Programmes',
        course_level: 'Primary School Studies',
        foundation_studies: 'No',
        work_component: 'No',
        course_language: 'English',
        duration: '364',
        total_cost: '66,500'
      }.each do |attribute, expected_value|
        its(attribute) { is_expected.to eq expected_value }
      end

      its(:contact_officers) { is_expected.to eq expected_officers }
    end

    context 'when the response body not contains pagination location' do
      let(:uri) { course_details_without_pagination_uri }

      its(:location_ids) { is_expected.to eq ["123", "456"] }
    end

    context 'when the contact officers contains table grid' do
      let(:uri) { course_details_with_contact_officers_table_grid }
      let(:expected_officers) do
        [
          CricosScrape::ContactOfficer.new('Principal Executive Officer', 'Andrew Vann', 'Vice-Chancellor', '02 6338 4209', '02 6338 4809', nil),
          CricosScrape::ContactOfficer.new('International Student Contact', 'Matthew Evans', nil, '02 63657537', '02 63657590', 'mevans@csu.edu.au'),
          CricosScrape::ContactOfficer.new('International Student Contact', 'Matthew Evans', nil, '02 6365 7537', '02 6365 7590', 'mevans@csu.edu.au')
        ]
      end

      its(:contact_officers) { is_expected.to eq expected_officers }
    end
  end
end