mwcrawler 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 535ef5f765febcd8e34046c5913006148e354e4c9d4735709ad4bba9fc30b058
4
+ data.tar.gz: f8014655d47a46e88bfeeccca72557341701685402182e6b0b878abfd362e534
5
+ SHA512:
6
+ metadata.gz: 9a2855da03d0feaa6386112e4ede645a5638a2f1d68628f115012220c81d4672959c358bd496853e1a10ce473a493e29cb0f2012803f4c484677ebd4df36b738
7
+ data.tar.gz: 34b27e7cbd83e191fb0065ba80b1ee62adcf7086a77e6e690e3fc426d958584420ed7dad110d18a55b402a1fba5408b3895399e7b0c38745332c9dc06fcd2594
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ /.vscode
10
+ *.gem
11
+
12
+ # rspec failure tracking
13
+ .rspec_status
14
+ *.json
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
@@ -0,0 +1,11 @@
1
+ AllCops:
2
+ Exclude:
3
+ - vendor/bundle/**/*
4
+ - '*.gemspec'
5
+
6
+ Metrics/LineLength:
7
+ Max: 120
8
+
9
+ Metrics/BlockLength:
10
+ Exclude:
11
+ - 'spec/**/*'
@@ -0,0 +1 @@
1
+ ruby-2.5.1
@@ -0,0 +1,11 @@
1
+ os: linux
2
+ rvm: 2.5.1
3
+ dist: trusty
4
+ sudo: false
5
+ cache: bundler
6
+ before_install:
7
+ - gem install bundler -v 1.16.1 --no-document
8
+ - bundle install
9
+ script:
10
+ - bundle exec rubocop
11
+ - bundle exec rspec
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
6
+
7
+ # Specify your gem's dependencies in mwcrawler.gemspec
8
+ gemspec
@@ -0,0 +1,95 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ mwcrawler (0.1.0)
5
+ nokogiri (~> 1.8)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ addressable (2.5.2)
11
+ public_suffix (>= 2.0.2, < 4.0)
12
+ ast (2.4.0)
13
+ coderay (1.1.2)
14
+ coveralls (0.8.22)
15
+ json (>= 1.8, < 3)
16
+ simplecov (~> 0.16.1)
17
+ term-ansicolor (~> 1.3)
18
+ thor (~> 0.19.4)
19
+ tins (~> 1.6)
20
+ crack (0.4.3)
21
+ safe_yaml (~> 1.0.0)
22
+ diff-lcs (1.3)
23
+ docile (1.3.1)
24
+ hashdiff (0.3.7)
25
+ jaro_winkler (1.5.1)
26
+ json (2.1.0)
27
+ method_source (0.9.0)
28
+ mini_portile2 (2.4.0)
29
+ nokogiri (1.10.9)
30
+ mini_portile2 (~> 2.4.0)
31
+ parallel (1.12.1)
32
+ parser (2.5.1.2)
33
+ ast (~> 2.4.0)
34
+ powerpack (0.1.2)
35
+ pry (0.11.3)
36
+ coderay (~> 1.1.0)
37
+ method_source (~> 0.9.0)
38
+ public_suffix (3.0.3)
39
+ rainbow (3.0.0)
40
+ rake (13.0.1)
41
+ rspec (3.8.0)
42
+ rspec-core (~> 3.8.0)
43
+ rspec-expectations (~> 3.8.0)
44
+ rspec-mocks (~> 3.8.0)
45
+ rspec-core (3.8.0)
46
+ rspec-support (~> 3.8.0)
47
+ rspec-expectations (3.8.1)
48
+ diff-lcs (>= 1.2.0, < 2.0)
49
+ rspec-support (~> 3.8.0)
50
+ rspec-mocks (3.8.0)
51
+ diff-lcs (>= 1.2.0, < 2.0)
52
+ rspec-support (~> 3.8.0)
53
+ rspec-support (3.8.0)
54
+ rubocop (0.59.2)
55
+ jaro_winkler (~> 1.5.1)
56
+ parallel (~> 1.10)
57
+ parser (>= 2.5, != 2.5.1.1)
58
+ powerpack (~> 0.1)
59
+ rainbow (>= 2.2.2, < 4.0)
60
+ ruby-progressbar (~> 1.7)
61
+ unicode-display_width (~> 1.0, >= 1.0.1)
62
+ ruby-progressbar (1.10.0)
63
+ safe_yaml (1.0.4)
64
+ simplecov (0.16.1)
65
+ docile (~> 1.1)
66
+ json (>= 1.8, < 3)
67
+ simplecov-html (~> 0.10.0)
68
+ simplecov-html (0.10.2)
69
+ term-ansicolor (1.7.0)
70
+ tins (~> 1.0)
71
+ thor (0.19.4)
72
+ tins (1.18.0)
73
+ unicode-display_width (1.4.0)
74
+ vcr (4.0.0)
75
+ webmock (3.4.2)
76
+ addressable (>= 2.3.6)
77
+ crack (>= 0.3.2)
78
+ hashdiff
79
+
80
+ PLATFORMS
81
+ ruby
82
+
83
+ DEPENDENCIES
84
+ bundler (~> 1.16)
85
+ coveralls
86
+ mwcrawler!
87
+ pry (~> 0.11)
88
+ rake (~> 13.0)
89
+ rspec (~> 3.0)
90
+ rubocop (~> 0.59.2)
91
+ vcr (~> 4.0)
92
+ webmock (~> 3.4)
93
+
94
+ BUNDLED WITH
95
+ 1.16.6
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 vitor pontes
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,86 @@
1
+ # Mwcrawler
2
+
3
+ Mwcrawler is a gem for parsing UnB's Matricula Web data into consumable hashes.
4
+
5
+ [![Build Status](https://travis-ci.com/danilodelyima/mwcrawler.svg?branch=master)](https://travis-ci.com/danilodelyima/mwcrawler)
6
+ [![Coverage Status](https://coveralls.io/repos/github/danilodelyima/mwcrawler/badge.svg?branch=master)](https://coveralls.io/github/danilodelyima/mwcrawler?branch=master)
7
+
8
+ ## Installation
9
+
10
+ Add this line to your application's Gemfile:
11
+
12
+ ```ruby
13
+ gem 'mwcrawler'
14
+ ```
15
+
16
+ And then execute:
17
+
18
+ bundle
19
+
20
+ Or install it yourself as:
21
+
22
+ gem install mwcrawler
23
+
24
+ ## Usage
25
+
26
+ First instantiate a new crawler with `crawler = Mwcrawler::Crawler.new`; then you can crawl like so:
27
+
28
+ ```ruby
29
+ courses_hash = crawler.courses
30
+ # return example
31
+ [{"type"=>"Presencial",
32
+ "code"=>"19",
33
+ "name"=>"ADMINISTRAÇÃO",
34
+ "shift"=>"Diurno",
35
+ "curriculums"=>
36
+ [{"name"=>"Administração",
37
+ "degree"=>"Bacharel",
38
+ "semester_max"=>"8",
39
+ "semester_min"=>"16",
40
+ "credits"=>"200"}]},
41
+ {"type"=>"Presencial",
42
+ "code"=>"701",
43
+ "name"=>"ADMINISTRAÇÃO",
44
+ "shift"=>"Noturno",
45
+ "curriculums"=>
46
+ [{"name"=>"Administração",
47
+ "degree"=>"Bacharel",
48
+ "semester_max"=>"8",
49
+ "semester_min"=>"16",
50
+ "credits"=>"200"}]}
51
+ ]
52
+ ```
53
+
54
+ The campus crawled by default is `:darcy_ribeiro`,
55
+ but you can specify another one, e.g. `crawler.courses(:planaltina)`.
56
+
57
+ The available resources are:
58
+
59
+ - `classes`
60
+ - `courses`
61
+ - `departments`
62
+ - `curriculum`
63
+
64
+ While `classes` and `curriculum` take `course_code` as param for crawling, `courses` and `departments` take as params any of the four campuses `:darcy_ribeiro`, `:planaltina`, `:ceilandia` and `:gama`.
65
+
66
+ The utility method `semester` returns the current semester.
67
+
68
+ ## Development
69
+
70
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
71
+
72
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
73
+
74
+ ## Contributing
75
+
76
+ Bug reports and pull requests are welcome on GitHub at https://github.com/danilodelyima/mwcrawler.
77
+
78
+ ### Guidelines
79
+
80
+ When developing new features the interface must reflect how much scraping is necessary. In other
81
+ words, if many pages are crawled the user must call many methods. This way we don't overload methods
82
+ with functionality, and the developer using the gem can more easily grasp the cost of scraping that info.
83
+
84
+ ## License
85
+
86
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
@@ -0,0 +1,14 @@
1
+ 2017-2
2
+
3
+ Total de turmas do Darcy: 7366
4
+ Total de turmas Planaltina: 284
5
+ Total de turmas Ceilândia: 526
6
+ Total de turmas Gama: 480
7
+
8
+
9
+ 2018-1
10
+
11
+ Total de turmas do Darcy: 9779 -- 34 min
12
+ Total de turmas Planaltina: 288 -- 2 min
13
+ Total de turmas Ceilândia: 543 -- 3 min
14
+ Total de turmas Gama: 298 -- 2 min
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'mwcrawler'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ require 'pry'
12
+ Pry.start
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,19 @@
1
# frozen_string_literal: true

require 'mwcrawler/version'
require 'nokogiri'

# pry is declared only as a development dependency in the gemspec, so it is
# not guaranteed to be installed wherever the gem is used. Load it when it is
# available and carry on without it otherwise, instead of failing to load the
# whole library.
begin
  require 'pry'
rescue LoadError
  nil
end

require 'open-uri'
require 'json'

require 'mwcrawler/classes'
require 'mwcrawler/courses'
require 'mwcrawler/departments'
require 'mwcrawler/subjects'
require 'mwcrawler/helpers'
require 'mwcrawler/crawler'

# Top-level namespace of the gem.
module Mwcrawler
  # Base URL of Matricula Web. Helpers.set_crawler appends the page path and
  # the looked-up code to it to build the address that gets fetched.
  SITE = 'https://matriculaweb.unb.br/'
end
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
module Mwcrawler
  # Scrapes the classes offered for every course linked from a department's
  # offer page.
  module Classes
    # Collects the class rows of every course linked from the department page.
    #
    # @param department_code [#to_s] code appended to the department offer URL
    # @return [Array<Hash>] one hash per class, as built by scrap_row
    def self.scrap(department_code)
      scrap_courses_links(department_code).flat_map { |course_link| scrap_classes(course_link) }
    end

    # Returns the href of every link in the second column of the department
    # offer table.
    private_class_method def self.scrap_courses_links(department_code)
      page = Helpers.set_crawler(department_code, 'graduacao/oferta_dis.aspx?cod=', exact: true)
      page.css('#datatable tr td:nth-child(2) a').map { |link| link['href'] }
    end

    # Builds one row per '.turma' element found on the course offer page and
    # logs the running total after each one.
    private_class_method def self.scrap_classes(course_link)
      page = Helpers.set_crawler(course_link, 'graduacao/', exact: true)
      class_names = page.css('.tabela-oferta .turma').map(&:text)

      rows = []
      class_names.each_with_index do |class_name, index|
        rows << scrap_row(class_row_init(page, class_name), page, index)
        Helpers.log "Total de turmas: #{rows.size}"
      end
      rows
    end

    # Fields shared by every class on the page: they are read from the first
    # '#datatable' table, not from the per-class offer tables.
    private_class_method def self.class_row_init(page, name)
      { department: page.css('#datatable tr:first-child a').text,
        code: page.css('#datatable')[0].css('tr:nth-child(2) td').text.to_i,
        course_code: scrap_course_code(page),
        credits: scrap_credit_hash(page),
        name: name }
    end

    # Reads the 'cod' query parameter of the first link in the third row.
    private_class_method def self.scrap_course_code(page)
      course_uri = page.css('#datatable')[0].css('tr:nth-child(3) td a').first['href']
      Helpers.uri_query_params(course_uri)['cod'].to_i
    end

    # Splits the fourth row's text on '-' into four integer credit counts.
    private_class_method def self.scrap_credit_hash(page)
      credit_string = page.css('#datatable')[0].css('tr:nth-child(4) td').text
      theory, practical, extension, study = credit_string.split('-').map(&:to_i)
      { theory: theory, practical: practical, extension: extension, study: study }
    end

    # Completes row_init (in place) with the data of the count-th offer table.
    #
    # @param row_init [Hash] shared fields built by class_row_init
    # @param page [#css] parsed course offer page
    # @param count [Integer] index of the class' '.tabela-oferta' table
    # @return [Hash] row_init with vacancies, :schedules and :teachers added
    private_class_method def self.scrap_row(row_init, page, count)
      # merge! is required here: plain merge returns a new hash, so the
      # vacancy fields were computed and then thrown away.
      row_init.merge!(scrap_vacancies(page, count))
      row_init[:schedules] = scrap_schedules(page, count)
      row_init[:teachers] = scrap_teachers(page, count)
      row_init
    end

    # Texts of the schedule cells (fourth column), grouped by Helpers.format_hours.
    private_class_method def self.scrap_schedules(page, count)
      schedules = page.css('.tabela-oferta')[count]
                      .css('tr td:nth-child(4) .table')
                      .css('td').map(&:text)

      Helpers.format_hours(schedules)
    end

    # Texts of the teacher cells (fifth column), passed through Helpers.format_teachers.
    private_class_method def self.scrap_teachers(page, count)
      teachers = page.css('.tabela-oferta')[count]
                     .css('tr td:nth-child(5) td')
                     .map(&:text)

      Helpers.format_teachers(teachers)
    end

    # Total, occupied and free vacancies: rows 1 to 3 of '.tabela-vagas'.
    private_class_method def self.scrap_vacancies(page, count)
      {
        vacancies_total: scrap_vacancy(1, page, count),
        vacancies_occupied: scrap_vacancy(2, page, count),
        vacancies_free: scrap_vacancy(3, page, count)
      }
    end

    # Text of the third cell of the given '.tabela-vagas' row.
    private_class_method def self.scrap_vacancy(vacancy_row, page, count)
      page.css('.tabela-oferta')[count]
          .css(".tabela-vagas tr:nth-child(#{vacancy_row}) td:nth-child(3)").text
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mwcrawler/curriculum'
4
+
5
module Mwcrawler
  # Scrapes the courses listed for a campus.
  module Courses
    # Reads every cell of the campus course table and turns each group of
    # four cells into one course row.
    #
    # @param campus [Symbol] campus name, resolved to an id by Helpers.set_crawler
    # @return [Array<Hash>] one string-keyed hash per course
    def self.scrap(campus)
      page = Helpers.set_crawler(campus, 'graduacao/curso_rel.aspx?cod=')
      cells = page.css('#datatable tr td').map(&:text)

      # Each course is one table line of four cells, so walk the cells in
      # slices instead of consuming the array with repeated shifts.
      rows = cells.each_slice(4).map { |course_cells| scrap_row(course_cells) }
      Helpers.log "Total de cursos: #{rows.count}"

      rows
    end

    # Builds a course row from its cells (type, code, name, shift) and fetches
    # the course's curriculums by code.
    private_class_method def self.scrap_row(courses)
      type, code, name, shift = courses
      { 'type' => type,
        'code' => code,
        'name' => name,
        'shift' => shift,
        'curriculums' => Curriculum.scrap(code) }
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
module Mwcrawler
  # Public entry point of the gem: each instance method delegates to one of
  # the scraper modules.
  class Crawler
    include Mwcrawler

    # Scraper module behind each generated instance method.
    SCRAPPERS = {
      courses: Courses,
      classes: Classes,
      departments: Departments
    }.freeze

    # Generates #courses, #classes and #departments. Each one initialises the
    # options and forwards its first argument to the scraper's .scrap.
    SCRAPPERS.each do |resource, scrapper|
      define_method(resource) do |campus = :darcy_ribeiro, options = { log: false }|
        Options.init(options)
        scrapper.scrap(campus)
      end
    end

    # Scrapes subjects; options is also handed to Subjects.scrap, which reads
    # its lookup mode from it.
    def subjects(department, options = { log: false })
      Options.init(options)
      Subjects.scrap(department, options)
    end

    # Text of the first "Período Atual" link span on the default page.
    def semester
      home = Helpers.set_crawler(nil, 'graduacao/default.aspx', exact: true)
      current_period = home.css("a[title='Período Atual'] span").first
      current_period.text
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
module Mwcrawler
  # Scrapes the curriculums of a course, looked up by course code.
  module Curriculum
    # Pairs every '.table-responsive .table' with the heading found at the
    # same position and builds one row per table.
    #
    # @param code [#to_s] course code appended to the course data URL
    # @return [Array<Hash>] one string-keyed hash per curriculum table
    def self.scrap(code)
      page = Helpers.set_crawler(code, 'graduacao/curso_dados.aspx?cod=', exact: true)
      names = page.css('.table-responsive h4').map { |heading| heading.children[0].text }

      # Index lookup keeps the name list intact; a table without a matching
      # heading gets a nil name, as before.
      page.css('.table-responsive .table').each_with_index.map do |table, index|
        scrap_row(names[index], table)
      end
    end

    # Builds a curriculum row from the first four lines of its table.
    #
    # NOTE(review): the README sample output shows semester_max (8) lower than
    # semester_min (16), so rows 2 and 3 may be mapped to the wrong keys -
    # confirm against the live page before changing.
    private_class_method def self.scrap_row(curriculum_name, table)
      { 'name' => curriculum_name,
        'degree' => table.css('tr:first td').text,
        'semester_max' => table.css('tr:nth-child(2) td').text,
        'semester_min' => table.css('tr:nth-child(3) td').text,
        'credits' => table.css('tr:nth-child(4) td').text }
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
module Mwcrawler
  # Scrapes the departments listed for a campus.
  module Departments
    # Reads every cell of the campus department table and turns each group of
    # three cells into one department row.
    #
    # @param campus [Symbol] campus name, resolved to an id by Helpers.set_crawler
    # @return [Array<Hash>] one string-keyed hash per department
    def self.scrap(campus)
      page = Helpers.set_crawler(campus, 'graduacao/oferta_dep.aspx?cod=')
      cells = page.css('#datatable tr td').map(&:text)

      # Each department is one table line of three cells, so walk the cells
      # in slices instead of consuming the array with repeated shifts.
      cells.each_slice(3).map { |department_cells| scrap_row(department_cells) }
    end

    # Builds a department row from its cells: code, acronym and name.
    private_class_method def self.scrap_row(departments)
      code, acronym, name = departments
      { 'code' => code, 'acronym' => acronym, 'name' => name }
    end
  end
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
module Mwcrawler
  # Maps campus names to the numeric ids used in the site's URLs.
  class Campuses
    CAMPUSES = {
      darcy_ribeiro: 1,
      planaltina: 2,
      ceilandia: 3,
      gama: 4
    }.freeze

    # @param campus [Symbol] one of the CAMPUSES keys
    # @return [Integer] id of the campus
    # @raise [ArgumentError] when campus is not a known key
    def self.id(campus)
      CAMPUSES.fetch(campus) do
        raise ArgumentError, "Campus: #{campus} not in: #{CAMPUSES.keys}"
      end
    end
  end

  # Holds the options of the current crawl (for now only the log flag).
  module Options
    module_function

    @log = false

    # Stores the :log entry of options as the current log flag.
    def init(options = { log: false })
      @log = options[:log].freeze
    end

    # @return [Object] the value last stored by init (false until then)
    def log_enabled?
      @log
    end
  end

  # Helper methods used throughout the lib
  class Helpers
    # Groups schedule cells five by five: day, start time, end time, a cell
    # that is discarded, and the class location. The four kept values of each
    # group are appended to row as one array.
    #
    # @param schedules [Array<String>] flat list of cell texts; left untouched
    # @param row [Array] accumulator the groups are appended to
    # @return [Array<Array>] row
    def self.format_hours(schedules, row = [])
      # Slicing (instead of shifting) keeps the caller's array intact.
      schedules.each_slice(5) do |day, starts_at, ends_at, _discarded, place|
        row << [day, starts_at, ends_at, place]
      end
      row
    end

    # @return [Array<String>] teachers, or ['A Designar'] when none is listed
    def self.format_teachers(teachers)
      teachers.empty? ? ['A Designar'] : teachers
    end

    # Fetches and parses SITE + search_mode + id.
    #
    # @param id [Object] campus name, or the literal code when options[:exact]
    # @param search_mode [String] page path and query prefix (classes, courses
    #   or curriculum pages)
    # @param options [Hash] :exact skips the campus name to id lookup
    # @return [Nokogiri::HTML::Document] the parsed page
    # @raise [ArgumentError] when a campus lookup is needed and id is unknown
    def self.set_crawler(id, search_mode, options = { exact: false })
      id = Campuses.id(id) unless options[:exact]
      url = SITE + search_mode + id.to_s
      Nokogiri::HTML(URI.parse(url).open)
    end

    # Writes object, serialised as JSON, to file_name (overwriting it).
    def self.write_json(file_name, object)
      File.write(file_name, object.to_json)
    end

    # Prints msg only when logging was enabled through Options.init.
    def self.log(msg)
      puts msg if Options.log_enabled?
    end

    # Splits the query string of uri into a hash of raw (undecoded) strings.
    # A URI without a query gives an empty hash; a parameter without '=' maps
    # to nil, and only the first '=' of a parameter separates key from value.
    #
    # @param uri [String]
    # @return [Hash{String => String, nil}]
    def self.uri_query_params(uri)
      query_string = URI.parse(uri).query.to_s
      query_string.split('&').each_with_object({}) do |param, params|
        next if param.empty?

        key, value = param.split('=', 2)
        params[key] = value
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module Mwcrawler
  # Scrapes subjects, either all of a department or a single one by id.
  module Subjects
    # @param department_or_id [#to_s] department code or subject id
    # @param options [Hash] :by_id or :by_department selects the lookup
    # @return [Hash, Array<Hash>] one subject (by id) or a list (by department)
    # @raise [ArgumentError] when neither lookup option is set
    def self.scrap(department_or_id, options)
      return subject_by_id(department_or_id) if options[:by_id]
      return subject_by_department(department_or_id) if options[:by_department]

      raise ArgumentError, 'second argument not specified. You can find a subject by department code or id'
    end

    # Lists every subject on the department's offer page.
    private_class_method def self.subject_by_department(department)
      page = Helpers.set_crawler(department, 'graduacao/oferta_dis.aspx?cod=', exact: true)
      scrap_row(department, page)
    end

    # Reads a single subject from its own data page.
    private_class_method def self.subject_by_id(id)
      page = Helpers.set_crawler(id, 'graduacao/oferta_dados.aspx?cod=', exact: true)
      row_init_by_id(page)
    end

    # Code and name come from rows 2 and 3 of the first '#datatable'; the
    # department is the first run of digits in the first row's link href.
    private_class_method def self.row_init_by_id(page)
      table = page.css('#datatable')[0]
      department_link = page.css('#datatable tr:first-child a').first

      { code: table.css('tr:nth-child(2) td').text.to_i,
        name: table.css('tr:nth-child(3) td').text,
        department: department_link['href'].scan(/\d+/)[0].to_i,
        level: 'graduação' }
    end

    # Builds one subject per entry of the first table column, taking the name
    # at the same position in the second column.
    private_class_method def self.scrap_row(dep_code, page)
      # Both columns are read once up front; querying and mapping the whole
      # page again for every row made this quadratic in the number of rows.
      codes = page.css('#datatable tr td:nth-child(1)').map(&:text)
      names = page.css('#datatable tr td:nth-child(2)').map(&:text)

      codes.each_with_index.map do |code, index|
        { code: code.to_i,
          name: names[index],
          department: dep_code.to_i,
          level: 'graduação' }
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mwcrawler
4
+ VERSION = '0.1.0'
5
+ end
@@ -0,0 +1,41 @@
1
+ lib = File.expand_path('lib', __dir__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'mwcrawler/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'mwcrawler'
7
+ spec.version = Mwcrawler::VERSION
8
+ spec.authors = ['Danilo de Lima', 'vitor pontes']
9
+ spec.email = ['vitormax2005@hotmail.com']
10
+
11
+ spec.summary = 'Gema para webscrapping do sistemas de matriculas da unb Matricula Web.'
12
+ spec.description = 'Essa gema provê uma api ruby para se fazer o scrapping de páginas html do sistema matricula web e retornar um conteudo que pode ser mais facilmente processado pelo programa'
13
+ spec.homepage = 'https://github.com/danilodelyima/mwcrawler'
14
+ spec.license = 'MIT'
15
+
16
+ # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
17
+ # to allow pushing to a single host or delete this section to allow pushing to any host.
18
+ # if spec.respond_to?(:metadata)
19
+ # spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
20
+ # else
21
+ # raise 'RubyGems 2.0 or newer is required to protect against ' \
22
+ # 'public gem pushes.'
23
+ # end
24
+
25
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
26
+ f.match(%r{^(test|spec|features)/})
27
+ end
28
+ spec.bindir = 'exe'
29
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
30
+ spec.require_paths = ['lib']
31
+
32
+ spec.add_development_dependency 'bundler', '~> 1.16'
33
+ spec.add_development_dependency 'coveralls'
34
+ spec.add_development_dependency 'pry', '~> 0.11'
35
+ spec.add_development_dependency 'rake', '~> 13.0'
36
+ spec.add_development_dependency 'rspec', '~> 3.0'
37
+ spec.add_development_dependency 'vcr', '~> 4.0'
38
+ spec.add_development_dependency 'webmock', '~> 3.4'
39
+ spec.add_development_dependency 'rubocop', '~> 0.59.2'
40
+ spec.add_dependency 'nokogiri', '~> 1.8'
41
+ end
metadata ADDED
@@ -0,0 +1,196 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mwcrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Danilo de Lima
8
+ - vitor pontes
9
+ autorequire:
10
+ bindir: exe
11
+ cert_chain: []
12
+ date: 2020-05-29 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.16'
21
+ type: :development
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '1.16'
28
+ - !ruby/object:Gem::Dependency
29
+ name: coveralls
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ - !ruby/object:Gem::Dependency
43
+ name: pry
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '0.11'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '0.11'
56
+ - !ruby/object:Gem::Dependency
57
+ name: rake
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - "~>"
61
+ - !ruby/object:Gem::Version
62
+ version: '13.0'
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '13.0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: rspec
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '3.0'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: '3.0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: vcr
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '4.0'
91
+ type: :development
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - "~>"
96
+ - !ruby/object:Gem::Version
97
+ version: '4.0'
98
+ - !ruby/object:Gem::Dependency
99
+ name: webmock
100
+ requirement: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - "~>"
103
+ - !ruby/object:Gem::Version
104
+ version: '3.4'
105
+ type: :development
106
+ prerelease: false
107
+ version_requirements: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - "~>"
110
+ - !ruby/object:Gem::Version
111
+ version: '3.4'
112
+ - !ruby/object:Gem::Dependency
113
+ name: rubocop
114
+ requirement: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - "~>"
117
+ - !ruby/object:Gem::Version
118
+ version: 0.59.2
119
+ type: :development
120
+ prerelease: false
121
+ version_requirements: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - "~>"
124
+ - !ruby/object:Gem::Version
125
+ version: 0.59.2
126
+ - !ruby/object:Gem::Dependency
127
+ name: nokogiri
128
+ requirement: !ruby/object:Gem::Requirement
129
+ requirements:
130
+ - - "~>"
131
+ - !ruby/object:Gem::Version
132
+ version: '1.8'
133
+ type: :runtime
134
+ prerelease: false
135
+ version_requirements: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - "~>"
138
+ - !ruby/object:Gem::Version
139
+ version: '1.8'
140
+ description: Essa gema provê uma api ruby para se fazer o scrapping de páginas html
141
+ do sistema matricula web e retornar um conteudo que pode ser mais facilmente processado
142
+ pelo programa
143
+ email:
144
+ - vitormax2005@hotmail.com
145
+ executables: []
146
+ extensions: []
147
+ extra_rdoc_files: []
148
+ files:
149
+ - ".gitignore"
150
+ - ".rspec"
151
+ - ".rubocop.yml"
152
+ - ".ruby-version"
153
+ - ".travis.yml"
154
+ - Gemfile
155
+ - Gemfile.lock
156
+ - LICENSE.txt
157
+ - README.md
158
+ - Rakefile
159
+ - TOTAL.txt
160
+ - bin/console
161
+ - bin/setup
162
+ - lib/mwcrawler.rb
163
+ - lib/mwcrawler/classes.rb
164
+ - lib/mwcrawler/courses.rb
165
+ - lib/mwcrawler/crawler.rb
166
+ - lib/mwcrawler/curriculum.rb
167
+ - lib/mwcrawler/departments.rb
168
+ - lib/mwcrawler/helpers.rb
169
+ - lib/mwcrawler/subjects.rb
170
+ - lib/mwcrawler/version.rb
171
+ - mwcrawler.gemspec
172
+ homepage: https://github.com/danilodelyima/mwcrawler
173
+ licenses:
174
+ - MIT
175
+ metadata: {}
176
+ post_install_message:
177
+ rdoc_options: []
178
+ require_paths:
179
+ - lib
180
+ required_ruby_version: !ruby/object:Gem::Requirement
181
+ requirements:
182
+ - - ">="
183
+ - !ruby/object:Gem::Version
184
+ version: '0'
185
+ required_rubygems_version: !ruby/object:Gem::Requirement
186
+ requirements:
187
+ - - ">="
188
+ - !ruby/object:Gem::Version
189
+ version: '0'
190
+ requirements: []
191
+ rubyforge_project:
192
+ rubygems_version: 2.7.6
193
+ signing_key:
194
+ specification_version: 4
195
+ summary: Gema para webscrapping do sistemas de matriculas da unb Matricula Web.
196
+ test_files: []