mwcrawler 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 535ef5f765febcd8e34046c5913006148e354e4c9d4735709ad4bba9fc30b058
+   data.tar.gz: f8014655d47a46e88bfeeccca72557341701685402182e6b0b878abfd362e534
+ SHA512:
+   metadata.gz: 9a2855da03d0feaa6386112e4ede645a5638a2f1d68628f115012220c81d4672959c358bd496853e1a10ce473a493e29cb0f2012803f4c484677ebd4df36b738
+   data.tar.gz: 34b27e7cbd83e191fb0065ba80b1ee62adcf7086a77e6e690e3fc426d958584420ed7dad110d18a55b402a1fba5408b3895399e7b0c38745332c9dc06fcd2594
data/.gitignore ADDED
@@ -0,0 +1,14 @@
+ /.bundle/
+ /.yardoc
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
+ /.vscode
+ *.gem
+
+ # rspec failure tracking
+ .rspec_status
+ *.json
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,11 @@
+ AllCops:
+   Exclude:
+     - vendor/bundle/**/*
+     - '*.gemspec'
+
+ Metrics/LineLength:
+   Max: 120
+
+ Metrics/BlockLength:
+   Exclude:
+     - 'spec/**/*'
data/.ruby-version ADDED
@@ -0,0 +1 @@
+ ruby-2.5.1
data/.travis.yml ADDED
@@ -0,0 +1,11 @@
+ os: linux
+ rvm: 2.5.1
+ dist: trusty
+ sudo: false
+ cache: bundler
+ before_install:
+   - gem install bundler -v 1.16.1 --no-document
+   - bundle install
+ script:
+   - bundle exec rubocop
+   - bundle exec rspec
data/Gemfile ADDED
@@ -0,0 +1,8 @@
+ # frozen_string_literal: true
+
+ source 'https://rubygems.org'
+
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
+
+ # Specify your gem's dependencies in mwcrawler.gemspec
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,95 @@
+ PATH
+   remote: .
+   specs:
+     mwcrawler (0.1.0)
+       nokogiri (~> 1.8)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     addressable (2.5.2)
+       public_suffix (>= 2.0.2, < 4.0)
+     ast (2.4.0)
+     coderay (1.1.2)
+     coveralls (0.8.22)
+       json (>= 1.8, < 3)
+       simplecov (~> 0.16.1)
+       term-ansicolor (~> 1.3)
+       thor (~> 0.19.4)
+       tins (~> 1.6)
+     crack (0.4.3)
+       safe_yaml (~> 1.0.0)
+     diff-lcs (1.3)
+     docile (1.3.1)
+     hashdiff (0.3.7)
+     jaro_winkler (1.5.1)
+     json (2.1.0)
+     method_source (0.9.0)
+     mini_portile2 (2.4.0)
+     nokogiri (1.10.9)
+       mini_portile2 (~> 2.4.0)
+     parallel (1.12.1)
+     parser (2.5.1.2)
+       ast (~> 2.4.0)
+     powerpack (0.1.2)
+     pry (0.11.3)
+       coderay (~> 1.1.0)
+       method_source (~> 0.9.0)
+     public_suffix (3.0.3)
+     rainbow (3.0.0)
+     rake (13.0.1)
+     rspec (3.8.0)
+       rspec-core (~> 3.8.0)
+       rspec-expectations (~> 3.8.0)
+       rspec-mocks (~> 3.8.0)
+     rspec-core (3.8.0)
+       rspec-support (~> 3.8.0)
+     rspec-expectations (3.8.1)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.8.0)
+     rspec-mocks (3.8.0)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.8.0)
+     rspec-support (3.8.0)
+     rubocop (0.59.2)
+       jaro_winkler (~> 1.5.1)
+       parallel (~> 1.10)
+       parser (>= 2.5, != 2.5.1.1)
+       powerpack (~> 0.1)
+       rainbow (>= 2.2.2, < 4.0)
+       ruby-progressbar (~> 1.7)
+       unicode-display_width (~> 1.0, >= 1.0.1)
+     ruby-progressbar (1.10.0)
+     safe_yaml (1.0.4)
+     simplecov (0.16.1)
+       docile (~> 1.1)
+       json (>= 1.8, < 3)
+       simplecov-html (~> 0.10.0)
+     simplecov-html (0.10.2)
+     term-ansicolor (1.7.0)
+       tins (~> 1.0)
+     thor (0.19.4)
+     tins (1.18.0)
+     unicode-display_width (1.4.0)
+     vcr (4.0.0)
+     webmock (3.4.2)
+       addressable (>= 2.3.6)
+       crack (>= 0.3.2)
+       hashdiff
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   bundler (~> 1.16)
+   coveralls
+   mwcrawler!
+   pry (~> 0.11)
+   rake (~> 13.0)
+   rspec (~> 3.0)
+   rubocop (~> 0.59.2)
+   vcr (~> 4.0)
+   webmock (~> 3.4)
+
+ BUNDLED WITH
+    1.16.6
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2018 vitor pontes
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,86 @@
+ # Mwcrawler
+
+ Mwcrawler is a gem for parsing UnB's Matricula Web data into consumable hashes.
+
+ [![Build Status](https://travis-ci.com/danilodelyima/mwcrawler.svg?branch=master)](https://travis-ci.com/danilodelyima/mwcrawler)
+ [![Coverage Status](https://coveralls.io/repos/github/danilodelyima/mwcrawler/badge.svg?branch=master)](https://coveralls.io/github/danilodelyima/mwcrawler?branch=master)
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'mwcrawler'
+ ```
+
+ And then execute:
+
+     bundle
+
+ Or install it yourself as:
+
+     gem install mwcrawler
+
+ ## Usage
+
+ First, instantiate a new crawler with `crawler = Mwcrawler::Crawler.new`; then you can crawl like so:
+
+ ```ruby
+ courses_hash = crawler.courses
+ # example return value
+ [{"type"=>"Presencial",
+   "code"=>"19",
+   "name"=>"ADMINISTRAÇÃO",
+   "shift"=>"Diurno",
+   "curriculums"=>
+    [{"name"=>"Administração",
+      "degree"=>"Bacharel",
+      "semester_max"=>"8",
+      "semester_min"=>"16",
+      "credits"=>"200"}]},
+  {"type"=>"Presencial",
+   "code"=>"701",
+   "name"=>"ADMINISTRAÇÃO",
+   "shift"=>"Noturno",
+   "curriculums"=>
+    [{"name"=>"Administração",
+      "degree"=>"Bacharel",
+      "semester_max"=>"8",
+      "semester_min"=>"16",
+      "credits"=>"200"}]}
+ ]
+ ```
+
+ By default the crawled campus is `:darcy_ribeiro`,
+ but you can specify another, e.g. `crawler.classes(:planaltina)`.
+
+ The available resources are:
+
+ - `classes`
+ - `courses`
+ - `departments`
+ - `curriculum`
+
+ While `classes` and `curriculum` take a `course_code` as their crawling parameter, `courses` and `departments` take any of the four campuses `:darcy_ribeiro`, `:planaltina`, `:ceilandia`, and `:gama`.
+
+ The utility method `semester` returns the current semester.
+
+ ## Development
+
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/danilodelyima/mwcrawler.
+
+ ### Guidelines
+
+ When developing new features, the interface must reflect how much scraping is necessary. In other
+ words, if many pages are crawled, the user must call many methods. This way we don't overload a
+ single method with functionality, and developers using the gem can more easily grasp the cost of scraping that information.
+
+ ## License
+
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
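
For orientation, here is a minimal usage sketch assembled from the README above and the `lib/mwcrawler/*.rb` sources further down in this diff. It is not part of the packaged files; it assumes the gem is installed, that matriculaweb.unb.br is reachable, and that department codes are taken from the crawled data rather than hard-coded.

```ruby
require 'mwcrawler'

crawler = Mwcrawler::Crawler.new

# Campus-level resources take a campus symbol and default to :darcy_ribeiro.
departments = crawler.departments(:gama)
courses     = crawler.courses

# Per-department resources take a code, so each extra page costs an explicit
# call -- the rule described in the Guidelines section of the README.
dep_code = departments.first['code']   # code taken from the crawl, not hard-coded
subjects = crawler.subjects(dep_code, by_department: true)

puts crawler.semester                  # current semester as shown on the site
puts "#{courses.size} courses, #{subjects.size} subjects in department #{dep_code}"
```

One caveat: in the source, `Classes.scrap` builds its URL from a department code, while the README example passes a campus symbol to `crawler.classes`; the sketch above sticks to the calls whose parameters are unambiguous in the code.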
data/Rakefile ADDED
@@ -0,0 +1,8 @@
+ # frozen_string_literal: true
+
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task default: :spec
data/TOTAL.txt ADDED
@@ -0,0 +1,14 @@
+ 2017-2
+
+ Total classes at Darcy: 7366
+ Total classes at Planaltina: 284
+ Total classes at Ceilândia: 526
+ Total classes at Gama: 480
+
+
+ 2018-1
+
+ Total classes at Darcy: 9779 -- 34 min
+ Total classes at Planaltina: 288 -- 2 min
+ Total classes at Ceilândia: 543 -- 3 min
+ Total classes at Gama: 298 -- 2 min
data/bin/console ADDED
@@ -0,0 +1,12 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ require 'bundler/setup'
+ require 'mwcrawler'
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ require 'pry'
+ Pry.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/lib/mwcrawler.rb ADDED
@@ -0,0 +1,19 @@
+ # frozen_string_literal: true
+
+ require 'mwcrawler/version'
+ require 'nokogiri'
+ require 'pry'
+ require 'open-uri'
+ require 'json'
+
+ require 'mwcrawler/classes'
+ require 'mwcrawler/courses'
+ require 'mwcrawler/departments'
+ require 'mwcrawler/subjects'
+ require 'mwcrawler/helpers'
+ require 'mwcrawler/crawler'
+
+ module Mwcrawler
+   # Site domain
+   SITE = 'https://matriculaweb.unb.br/'
+ end
data/lib/mwcrawler/classes.rb ADDED
@@ -0,0 +1,92 @@
+ # frozen_string_literal: true
+
+ module Mwcrawler
+   # Scrapes classes for a department
+   module Classes
+     def self.scrap(department_code)
+       courses_links = scrap_courses_links(department_code)
+       rows = []
+       courses_links.each do |course_link|
+         rows += scrap_classes(course_link)
+       end
+       rows
+     end
+
+     private_class_method def self.scrap_courses_links(department_code)
+       page = Helpers.set_crawler(department_code, 'graduacao/oferta_dis.aspx?cod=', exact: true)
+       page.css('#datatable tr td:nth-child(2) a')
+           .map { |link| link['href'] }
+     end
+
+     private_class_method def self.scrap_classes(course_link)
+       rows = []
+
+       page = Helpers.set_crawler(course_link, 'graduacao/', exact: true)
+       page_classes = page.css('.tabela-oferta .turma').map(&:text)
+
+       page_classes.each_with_index do |cl, i|
+         row_init = class_row_init(page, cl)
+         rows << scrap_row(row_init, page, i)
+         Helpers.log "Total de turmas: #{rows.size}"
+       end
+       rows
+     end
+     private_class_method def self.class_row_init(page, name)
+       { department: page.css('#datatable tr:first-child a').text,
+         code: page.css('#datatable')[0].css('tr:nth-child(2) td').text.to_i,
+         course_code: scrap_course_code(page),
+         credits: scrap_credit_hash(page),
+         name: name }
+     end
+
+     private_class_method def self.scrap_course_code(page)
+       course_uri = page.css('#datatable')[0].css('tr:nth-child(3) td a').first['href']
+       Helpers.uri_query_params(course_uri)['cod'].to_i
+     end
+
+     private_class_method def self.scrap_credit_hash(page)
+       credit_string = page.css('#datatable')[0].css('tr:nth-child(4) td').text
+       credits = credit_string.split('-').map(&:to_i)
+       { theory: credits[0], practical: credits[1], extension: credits[2], study: credits[3] }
+     end
+
+     private_class_method def self.scrap_row(row_init, page, count)
+       row = row_init
+       row.merge!(scrap_vacancies(page, count)) # merge! so the vacancy counts are actually kept
+       # Schedules
+       row[:schedules] = scrap_schedules(page, count)
+       # Teachers
+       row[:teachers] = scrap_teachers(page, count)
+       row
+     end
+
+     private_class_method def self.scrap_schedules(page, count)
+       schedules = page.css('.tabela-oferta')[count]
+                       .css('tr td:nth-child(4) .table')
+                       .css('td').map(&:text)
+
+       Helpers.format_hours(schedules)
+     end
+
+     private_class_method def self.scrap_teachers(page, count)
+       teachers = page.css('.tabela-oferta')[count]
+                      .css('tr td:nth-child(5) td')
+                      .map(&:text)
+
+       Helpers.format_teachers(teachers)
+     end
+
+     private_class_method def self.scrap_vacancies(page, count)
+       {
+         vacancies_total: scrap_vacancy(1, page, count),
+         vacancies_occupied: scrap_vacancy(2, page, count),
+         vacancies_free: scrap_vacancy(3, page, count)
+       }
+     end
+
+     private_class_method def self.scrap_vacancy(vacancy_row, page, count)
+       page.css('.tabela-oferta')[count]
+           .css(".tabela-vagas tr:nth-child(#{vacancy_row}) td:nth-child(3)").text
+     end
+   end
+ end
data/lib/mwcrawler/courses.rb ADDED
@@ -0,0 +1,30 @@
+ # frozen_string_literal: true
+
+ require 'mwcrawler/curriculum'
+
+ module Mwcrawler
+   # Scrapes courses by campus
+   module Courses
+     def self.scrap(campus)
+       page = Helpers.set_crawler(campus, 'graduacao/curso_rel.aspx?cod=')
+       courses = page.css('#datatable tr td').map(&:text)
+
+       # Each course becomes one row, so rows is the collection of all courses
+       rows = []
+       rows << scrap_row(courses) until courses.empty?
+       Helpers.log "Total de cursos: #{rows.count}"
+
+       rows
+     end
+
+     private_class_method def self.scrap_row(courses)
+       row = {}
+       row['type'] = courses.shift
+       row['code'] = courses.shift
+       row['name'] = courses.shift
+       row['shift'] = courses.shift
+       row['curriculums'] = Curriculum.scrap(row['code'])
+       row
+     end
+   end
+ end
data/lib/mwcrawler/crawler.rb ADDED
@@ -0,0 +1,31 @@
+ # frozen_string_literal: true
+
+ module Mwcrawler
+   # Main API for crawling
+   class Crawler
+     include Mwcrawler
+
+     SCRAPPERS = {
+       courses: Courses,
+       classes: Classes,
+       departments: Departments
+     }.freeze
+
+     SCRAPPERS.keys.each do |method|
+       define_method(method) do |campus = :darcy_ribeiro, options = { log: false }|
+         Options.init(options)
+         SCRAPPERS[method].scrap campus
+       end
+     end
+
+     def subjects(department, options = { log: false })
+       Options.init(options)
+       Subjects.scrap department, options
+     end
+
+     def semester
+       page = Helpers.set_crawler(nil, 'graduacao/default.aspx', exact: true)
+       page.css("a[title='Período Atual'] span").first.text
+     end
+   end
+ end
data/lib/mwcrawler/curriculum.rb ADDED
@@ -0,0 +1,26 @@
+ # frozen_string_literal: true
+
+ module Mwcrawler
+   # Scrapes curriculums by course code
+   module Curriculum
+     def self.scrap(code)
+       rows = []
+       page = Helpers.set_crawler(code, 'graduacao/curso_dados.aspx?cod=', exact: true)
+       curriculums = page.css('.table-responsive h4').map { |item| item.children[0].text }
+       page.css('.table-responsive .table').each do |table|
+         rows << scrap_row(curriculums.shift, table)
+       end
+       rows
+     end
+
+     private_class_method def self.scrap_row(curriculum_name, table)
+       row = {}
+       row['name'] = curriculum_name
+       row['degree'] = table.css('tr:first td').text
+       row['semester_max'] = table.css('tr:nth-child(2) td').text
+       row['semester_min'] = table.css('tr:nth-child(3) td').text
+       row['credits'] = table.css('tr:nth-child(4) td').text
+       row
+     end
+   end
+ end
data/lib/mwcrawler/departments.rb ADDED
@@ -0,0 +1,24 @@
+ # frozen_string_literal: true
+
+ module Mwcrawler
+   # Scrapes departments by campus
+   module Departments
+     def self.scrap(campus)
+       page = Helpers.set_crawler(campus, 'graduacao/oferta_dep.aspx?cod=')
+       departments = page.css('#datatable tr td').map(&:text)
+
+       # Each department becomes one row, so rows is the collection of all departments
+       rows = []
+       rows << scrap_row(departments) until departments.empty?
+       rows
+     end
+
+     private_class_method def self.scrap_row(departments)
+       row = {}
+       row['code'] = departments.shift
+       row['acronym'] = departments.shift
+       row['name'] = departments.shift
+       row
+     end
+   end
+ end
data/lib/mwcrawler/helpers.rb ADDED
@@ -0,0 +1,76 @@
+ # frozen_string_literal: true
+
+ module Mwcrawler
+   # Controls available campuses
+   class Campuses
+     CAMPUSES = {
+       darcy_ribeiro: 1,
+       planaltina: 2,
+       ceilandia: 3,
+       gama: 4
+     }.freeze
+
+     def self.id(campus)
+       raise ArgumentError, "Campus: #{campus} not in: #{CAMPUSES.keys}" unless CAMPUSES.include? campus
+
+       CAMPUSES[campus]
+     end
+   end
+
+   # Options module
+   module Options
+     module_function
+
+     @log = false
+
+     def init(options = { log: false })
+       @log = options[:log].freeze
+     end
+
+     def log_enabled?
+       @log
+     end
+   end
+
+   # Helper methods used throughout the lib
+   class Helpers
+     def self.format_hours(schedules, row = [])
+       until schedules.empty?
+         schedule = []
+         schedule << schedules.shift # day
+         schedule << schedules.shift # start time
+         schedule << schedules.shift # end time
+         schedules.shift # discard junk entry
+         schedule << schedules.shift # class location
+         row << schedule
+       end
+       row
+     end
+
+     def self.format_teachers(teachers)
+       teachers.empty? ? ['A Designar'] : teachers
+     end
+
+     # search_mode picks the page to crawl: classes, courses, or curriculum
+     def self.set_crawler(id, search_mode, options = { exact: false })
+       id = Campuses.id id unless options[:exact]
+       url = SITE + search_mode + id.to_s
+       Nokogiri::HTML(URI.parse(url).open)
+     end
+
+     def self.write_json(file_name, object)
+       File.open(file_name, 'w+') do |f|
+         f.write object.to_json
+       end
+     end
+
+     def self.log(msg)
+       puts msg if Options.log_enabled?
+     end
+
+     def self.uri_query_params(uri)
+       query_string = URI.parse(uri).query
+       query_string.split('&').map { |param| param.split('=') }.to_h
+     end
+   end
+ end
data/lib/mwcrawler/subjects.rb ADDED
@@ -0,0 +1,49 @@
+ # frozen_string_literal: true
+
+ module Mwcrawler
+   # Scrapes subjects by department or by id
+   module Subjects
+     def self.scrap(department_or_id, options)
+       if options[:by_id]
+         subject_by_id(department_or_id)
+       elsif options[:by_department]
+         subject_by_department(department_or_id)
+       else
+         raise ArgumentError, 'second argument not specified. You can find a subject by department code or id'
+       end
+     end
+
+     private_class_method def self.subject_by_department(department)
+       page = Helpers.set_crawler(department, 'graduacao/oferta_dis.aspx?cod=', exact: true)
+       scrap_row(department, page)
+     end
+
+     private_class_method def self.subject_by_id(id)
+       page = Helpers.set_crawler(id, 'graduacao/oferta_dados.aspx?cod=', exact: true)
+       row_init_by_id(page)
+     end
+
+     private_class_method def self.row_init_by_id(page)
+       { code: page.css('#datatable')[0].css('tr:nth-child(2) td').text.to_i,
+         name: page.css('#datatable')[0].css('tr:nth-child(3) td').text,
+         department: page.css('#datatable tr:first-child a').first['href'].scan(/\d+/)[0].to_i,
+         level: 'graduação' }
+     end
+
+     private_class_method def self.scrap_row(dep_code, page)
+       subjects = []
+       length = page.css('#datatable tr td:nth-child(1)').count
+       length.times do |i|
+         subjects << row_init_by_department(page, dep_code, i)
+       end
+       subjects
+     end
+
+     private_class_method def self.row_init_by_department(page, dep_code, index)
+       { code: page.css('#datatable tr td:nth-child(1)').map(&:text)[index].to_i,
+         name: page.css('#datatable tr td:nth-child(2)').map(&:text)[index],
+         department: dep_code.to_i,
+         level: 'graduação' }
+     end
+   end
+ end
data/lib/mwcrawler/version.rb ADDED
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module Mwcrawler
+   VERSION = '0.1.0'
+ end
data/mwcrawler.gemspec ADDED
@@ -0,0 +1,41 @@
+ lib = File.expand_path('lib', __dir__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'mwcrawler/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = 'mwcrawler'
+   spec.version = Mwcrawler::VERSION
+   spec.authors = ['Danilo de Lima', 'vitor pontes']
+   spec.email = ['vitormax2005@hotmail.com']
+
+   spec.summary = 'Gema para webscrapping do sistemas de matriculas da unb Matricula Web.'
+   spec.description = 'Essa gema provê uma api ruby para se fazer o scrapping de páginas html do sistema matricula web e retornar um conteudo que pode ser mais facilmente processado pelo programa'
+   spec.homepage = 'https://github.com/danilodelyima/mwcrawler'
+   spec.license = 'MIT'
+
+   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
+   # to allow pushing to a single host or delete this section to allow pushing to any host.
+   # if spec.respond_to?(:metadata)
+   #   spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
+   # else
+   #   raise 'RubyGems 2.0 or newer is required to protect against ' \
+   #     'public gem pushes.'
+   # end
+
+   spec.files = `git ls-files -z`.split("\x0").reject do |f|
+     f.match(%r{^(test|spec|features)/})
+   end
+   spec.bindir = 'exe'
+   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ['lib']
+
+   spec.add_development_dependency 'bundler', '~> 1.16'
+   spec.add_development_dependency 'coveralls'
+   spec.add_development_dependency 'pry', '~> 0.11'
+   spec.add_development_dependency 'rake', '~> 13.0'
+   spec.add_development_dependency 'rspec', '~> 3.0'
+   spec.add_development_dependency 'vcr', '~> 4.0'
+   spec.add_development_dependency 'webmock', '~> 3.4'
+   spec.add_development_dependency 'rubocop', '~> 0.59.2'
+   spec.add_dependency 'nokogiri', '~> 1.8'
+ end
metadata ADDED
@@ -0,0 +1,196 @@
+ --- !ruby/object:Gem::Specification
+ name: mwcrawler
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Danilo de Lima
+ - vitor pontes
+ autorequire:
+ bindir: exe
+ cert_chain: []
+ date: 2020-05-29 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.16'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.16'
+ - !ruby/object:Gem::Dependency
+   name: coveralls
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: pry
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.11'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.11'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+ - !ruby/object:Gem::Dependency
+   name: vcr
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '4.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '4.0'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.4'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.4'
+ - !ruby/object:Gem::Dependency
+   name: rubocop
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.59.2
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.59.2
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.8'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.8'
+ description: Essa gema provê uma api ruby para se fazer o scrapping de páginas html
+   do sistema matricula web e retornar um conteudo que pode ser mais facilmente processado
+   pelo programa
+ email:
+ - vitormax2005@hotmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - ".rspec"
+ - ".rubocop.yml"
+ - ".ruby-version"
+ - ".travis.yml"
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - TOTAL.txt
+ - bin/console
+ - bin/setup
+ - lib/mwcrawler.rb
+ - lib/mwcrawler/classes.rb
+ - lib/mwcrawler/courses.rb
+ - lib/mwcrawler/crawler.rb
+ - lib/mwcrawler/curriculum.rb
+ - lib/mwcrawler/departments.rb
+ - lib/mwcrawler/helpers.rb
+ - lib/mwcrawler/subjects.rb
+ - lib/mwcrawler/version.rb
+ - mwcrawler.gemspec
+ homepage: https://github.com/danilodelyima/mwcrawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.7.6
+ signing_key:
+ specification_version: 4
+ summary: Gema para webscrapping do sistemas de matriculas da unb Matricula Web.
+ test_files: []