mwcrawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.rubocop.yml +11 -0
- data/.ruby-version +1 -0
- data/.travis.yml +11 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +95 -0
- data/LICENSE.txt +21 -0
- data/README.md +86 -0
- data/Rakefile +8 -0
- data/TOTAL.txt +14 -0
- data/bin/console +12 -0
- data/bin/setup +8 -0
- data/lib/mwcrawler.rb +19 -0
- data/lib/mwcrawler/classes.rb +92 -0
- data/lib/mwcrawler/courses.rb +30 -0
- data/lib/mwcrawler/crawler.rb +31 -0
- data/lib/mwcrawler/curriculum.rb +26 -0
- data/lib/mwcrawler/departments.rb +24 -0
- data/lib/mwcrawler/helpers.rb +76 -0
- data/lib/mwcrawler/subjects.rb +49 -0
- data/lib/mwcrawler/version.rb +5 -0
- data/mwcrawler.gemspec +41 -0
- metadata +196 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 535ef5f765febcd8e34046c5913006148e354e4c9d4735709ad4bba9fc30b058
|
|
4
|
+
data.tar.gz: f8014655d47a46e88bfeeccca72557341701685402182e6b0b878abfd362e534
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 9a2855da03d0feaa6386112e4ede645a5638a2f1d68628f115012220c81d4672959c358bd496853e1a10ce473a493e29cb0f2012803f4c484677ebd4df36b738
|
|
7
|
+
data.tar.gz: 34b27e7cbd83e191fb0065ba80b1ee62adcf7086a77e6e690e3fc426d958584420ed7dad110d18a55b402a1fba5408b3895399e7b0c38745332c9dc06fcd2594
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
data/.ruby-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ruby-2.5.1
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
mwcrawler (0.1.0)
|
|
5
|
+
nokogiri (~> 1.8)
|
|
6
|
+
|
|
7
|
+
GEM
|
|
8
|
+
remote: https://rubygems.org/
|
|
9
|
+
specs:
|
|
10
|
+
addressable (2.5.2)
|
|
11
|
+
public_suffix (>= 2.0.2, < 4.0)
|
|
12
|
+
ast (2.4.0)
|
|
13
|
+
coderay (1.1.2)
|
|
14
|
+
coveralls (0.8.22)
|
|
15
|
+
json (>= 1.8, < 3)
|
|
16
|
+
simplecov (~> 0.16.1)
|
|
17
|
+
term-ansicolor (~> 1.3)
|
|
18
|
+
thor (~> 0.19.4)
|
|
19
|
+
tins (~> 1.6)
|
|
20
|
+
crack (0.4.3)
|
|
21
|
+
safe_yaml (~> 1.0.0)
|
|
22
|
+
diff-lcs (1.3)
|
|
23
|
+
docile (1.3.1)
|
|
24
|
+
hashdiff (0.3.7)
|
|
25
|
+
jaro_winkler (1.5.1)
|
|
26
|
+
json (2.1.0)
|
|
27
|
+
method_source (0.9.0)
|
|
28
|
+
mini_portile2 (2.4.0)
|
|
29
|
+
nokogiri (1.10.9)
|
|
30
|
+
mini_portile2 (~> 2.4.0)
|
|
31
|
+
parallel (1.12.1)
|
|
32
|
+
parser (2.5.1.2)
|
|
33
|
+
ast (~> 2.4.0)
|
|
34
|
+
powerpack (0.1.2)
|
|
35
|
+
pry (0.11.3)
|
|
36
|
+
coderay (~> 1.1.0)
|
|
37
|
+
method_source (~> 0.9.0)
|
|
38
|
+
public_suffix (3.0.3)
|
|
39
|
+
rainbow (3.0.0)
|
|
40
|
+
rake (13.0.1)
|
|
41
|
+
rspec (3.8.0)
|
|
42
|
+
rspec-core (~> 3.8.0)
|
|
43
|
+
rspec-expectations (~> 3.8.0)
|
|
44
|
+
rspec-mocks (~> 3.8.0)
|
|
45
|
+
rspec-core (3.8.0)
|
|
46
|
+
rspec-support (~> 3.8.0)
|
|
47
|
+
rspec-expectations (3.8.1)
|
|
48
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
49
|
+
rspec-support (~> 3.8.0)
|
|
50
|
+
rspec-mocks (3.8.0)
|
|
51
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
52
|
+
rspec-support (~> 3.8.0)
|
|
53
|
+
rspec-support (3.8.0)
|
|
54
|
+
rubocop (0.59.2)
|
|
55
|
+
jaro_winkler (~> 1.5.1)
|
|
56
|
+
parallel (~> 1.10)
|
|
57
|
+
parser (>= 2.5, != 2.5.1.1)
|
|
58
|
+
powerpack (~> 0.1)
|
|
59
|
+
rainbow (>= 2.2.2, < 4.0)
|
|
60
|
+
ruby-progressbar (~> 1.7)
|
|
61
|
+
unicode-display_width (~> 1.0, >= 1.0.1)
|
|
62
|
+
ruby-progressbar (1.10.0)
|
|
63
|
+
safe_yaml (1.0.4)
|
|
64
|
+
simplecov (0.16.1)
|
|
65
|
+
docile (~> 1.1)
|
|
66
|
+
json (>= 1.8, < 3)
|
|
67
|
+
simplecov-html (~> 0.10.0)
|
|
68
|
+
simplecov-html (0.10.2)
|
|
69
|
+
term-ansicolor (1.7.0)
|
|
70
|
+
tins (~> 1.0)
|
|
71
|
+
thor (0.19.4)
|
|
72
|
+
tins (1.18.0)
|
|
73
|
+
unicode-display_width (1.4.0)
|
|
74
|
+
vcr (4.0.0)
|
|
75
|
+
webmock (3.4.2)
|
|
76
|
+
addressable (>= 2.3.6)
|
|
77
|
+
crack (>= 0.3.2)
|
|
78
|
+
hashdiff
|
|
79
|
+
|
|
80
|
+
PLATFORMS
|
|
81
|
+
ruby
|
|
82
|
+
|
|
83
|
+
DEPENDENCIES
|
|
84
|
+
bundler (~> 1.16)
|
|
85
|
+
coveralls
|
|
86
|
+
mwcrawler!
|
|
87
|
+
pry (~> 0.11)
|
|
88
|
+
rake (~> 13.0)
|
|
89
|
+
rspec (~> 3.0)
|
|
90
|
+
rubocop (~> 0.59.2)
|
|
91
|
+
vcr (~> 4.0)
|
|
92
|
+
webmock (~> 3.4)
|
|
93
|
+
|
|
94
|
+
BUNDLED WITH
|
|
95
|
+
1.16.6
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2018 vitor pontes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Mwcrawler
|
|
2
|
+
|
|
3
|
+
Mwcrawler is a gem for parsing UnB's Matricula Web data into consumable hashes.
|
|
4
|
+
|
|
5
|
+
[Build Status](https://travis-ci.com/danilodelyima/mwcrawler)
|
|
6
|
+
[Coverage Status](https://coveralls.io/github/danilodelyima/mwcrawler?branch=master)
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
|
|
10
|
+
Add this line to your application's Gemfile:
|
|
11
|
+
|
|
12
|
+
```ruby
|
|
13
|
+
gem 'mwcrawler'
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
And then execute:
|
|
17
|
+
|
|
18
|
+
bundle
|
|
19
|
+
|
|
20
|
+
Or install it yourself as:
|
|
21
|
+
|
|
22
|
+
gem install mwcrawler
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
|
|
26
|
+
First instantiate a new crawler `crawler = Mwcrawler::Crawler.new` then you can crawl like so:
|
|
27
|
+
|
|
28
|
+
```ruby
|
|
29
|
+
courses_hash = crawler.courses
|
|
30
|
+
# return example
|
|
31
|
+
[{"type"=>"Presencial",
|
|
32
|
+
"code"=>"19",
|
|
33
|
+
"name"=>"ADMINISTRAÇÃO",
|
|
34
|
+
"shift"=>"Diurno",
|
|
35
|
+
"curriculums"=>
|
|
36
|
+
[{"name"=>"Administração",
|
|
37
|
+
"degree"=>"Bacharel",
|
|
38
|
+
"semester_max"=>"8",
|
|
39
|
+
"semester_min"=>"16",
|
|
40
|
+
"credits"=>"200"}]},
|
|
41
|
+
{"type"=>"Presencial",
|
|
42
|
+
"code"=>"701",
|
|
43
|
+
"name"=>"ADMINISTRAÇÃO",
|
|
44
|
+
"shift"=>"Noturno",
|
|
45
|
+
"curriculums"=>
|
|
46
|
+
[{"name"=>"Administração",
|
|
47
|
+
"degree"=>"Bacharel",
|
|
48
|
+
"semester_max"=>"8",
|
|
49
|
+
"semester_min"=>"16",
|
|
50
|
+
"credits"=>"200"}]}
|
|
51
|
+
]
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
The campus crawled by default is `:darcy_ribeiro`,
|
|
55
|
+
but you can specify another `crawler.classes(:planaltina)`.
|
|
56
|
+
|
|
57
|
+
The available resources are:
|
|
58
|
+
|
|
59
|
+
- `classes`
|
|
60
|
+
- `courses`
|
|
61
|
+
- `departments`
|
|
62
|
+
- `curriculum`
|
|
63
|
+
|
|
64
|
+
While `classes` and `curriculum` take `course_code` as param for crawling, `courses` and `departments` take as params any of the four campuses `:darcy_ribeiro`, `:planaltina`, `:ceilandia` and `:gama`.
|
|
65
|
+
|
|
66
|
+
The utility method `semester` returns the current semester.
|
|
67
|
+
|
|
68
|
+
## Development
|
|
69
|
+
|
|
70
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
71
|
+
|
|
72
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
|
73
|
+
|
|
74
|
+
## Contributing
|
|
75
|
+
|
|
76
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/danilodelyima/mwcrawler.
|
|
77
|
+
|
|
78
|
+
### Guidelines
|
|
79
|
+
|
|
80
|
+
When developing new features the interface must reflect how much scraping is necessary. In other
|
|
81
|
+
words, if many pages are crawled the user must call many methods. This way we don't overload methods
|
|
82
|
+
with functionality, and the developer using the gem can more easily grasp the cost of scraping that info.
|
|
83
|
+
|
|
84
|
+
## License
|
|
85
|
+
|
|
86
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/TOTAL.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
2017-2
|
|
2
|
+
|
|
3
|
+
Total de turmas do Darcy: 7366
|
|
4
|
+
Total de turmas Planaltina: 284
|
|
5
|
+
Total de turmas Ceilândia: 526
|
|
6
|
+
Total de turmas Gama: 480
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
2018-1
|
|
10
|
+
|
|
11
|
+
Total de turmas do Darcy: 9779 -- 34 min
|
|
12
|
+
Total de turmas Planaltina: 288 -- 2 min
|
|
13
|
+
Total de turmas Ceilândia: 543 -- 3 min
|
|
14
|
+
Total de turmas Gama: 298 -- 2 min
|
data/bin/console
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require 'bundler/setup'
|
|
5
|
+
require 'mwcrawler'
|
|
6
|
+
|
|
7
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
|
8
|
+
# with your gem easier. You can also use a different console, if you like.
|
|
9
|
+
|
|
10
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
|
11
|
+
require 'pry'
|
|
12
|
+
Pry.start
|
data/bin/setup
ADDED
data/lib/mwcrawler.rb
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'mwcrawler/version'
|
|
4
|
+
require 'nokogiri'
|
|
5
|
+
require 'pry'
|
|
6
|
+
require 'open-uri'
|
|
7
|
+
require 'json'
|
|
8
|
+
|
|
9
|
+
require 'mwcrawler/classes'
|
|
10
|
+
require 'mwcrawler/courses'
|
|
11
|
+
require 'mwcrawler/departments'
|
|
12
|
+
require 'mwcrawler/subjects'
|
|
13
|
+
require 'mwcrawler/helpers'
|
|
14
|
+
require 'mwcrawler/crawler'
|
|
15
|
+
|
|
16
|
+
# Top-level namespace of the gem.
module Mwcrawler
  # Base URL of the Matricula Web site; crawled paths are appended to it.
  SITE = 'https://matriculaweb.unb.br/'
end
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Mwcrawler
|
|
4
|
+
# Scrapes the classes ("turmas") offered by a department, one course page at
# a time.
module Classes
  # Collects every class of every course linked from a department's offer page.
  #
  # @param department_code [#to_s] value appended verbatim to the
  #   `graduacao/oferta_dis.aspx?cod=` URL
  # @return [Array<Hash>] one row per class, in page order
  def self.scrap(department_code)
    scrap_courses_links(department_code).flat_map do |course_link|
      scrap_classes(course_link)
    end
  end

  # Returns the href of every link found in the second column of the
  # department's offer table.
  private_class_method def self.scrap_courses_links(department_code)
    page = Helpers.set_crawler(department_code, 'graduacao/oferta_dis.aspx?cod=', exact: true)
    page.css('#datatable tr td:nth-child(2) a').map { |link| link['href'] }
  end

  # Builds one row per `.turma` element of a course page.
  private_class_method def self.scrap_classes(course_link)
    page = Helpers.set_crawler(course_link, 'graduacao/', exact: true)
    class_names = page.css('.tabela-oferta .turma').map(&:text)

    rows = []
    class_names.each_with_index do |class_name, index|
      rows << scrap_row(class_row_init(page, class_name), page, index)
      # Logged after every class, so the message shows a running total.
      Helpers.log "Total de turmas: #{rows.size}"
    end
    rows
  end

  # Fields shared by every class of the page (read from the first
  # `#datatable`) plus the class name itself.
  private_class_method def self.class_row_init(page, name)
    {
      department: page.css('#datatable tr:first-child a').text,
      code: page.css('#datatable')[0].css('tr:nth-child(2) td').text.to_i,
      course_code: scrap_course_code(page),
      credits: scrap_credit_hash(page),
      name: name
    }
  end

  # Reads the `cod` query parameter of the first link in the third row of the
  # header table and returns it as an Integer.
  private_class_method def self.scrap_course_code(page)
    course_uri = page.css('#datatable')[0].css('tr:nth-child(3) td a').first['href']
    Helpers.uri_query_params(course_uri)['cod'].to_i
  end

  # Splits the dash-separated credit cell (fourth row of the header table)
  # into its four integer components.
  private_class_method def self.scrap_credit_hash(page)
    theory, practical, extension, study =
      page.css('#datatable')[0].css('tr:nth-child(4) td').text.split('-').map(&:to_i)
    { theory: theory, practical: practical, extension: extension, study: study }
  end

  # Completes +row_init+ with the vacancies, schedules and teachers of the
  # class at position +count+ on the page.
  private_class_method def self.scrap_row(row_init, page, count)
    # merge! rather than merge: the non-destructive form returned a new hash
    # that was thrown away, so the vacancy counters never reached the row.
    row = row_init.merge!(scrap_vacancies(page, count))
    row[:schedules] = scrap_schedules(page, count)
    row[:teachers] = scrap_teachers(page, count)
    row
  end

  # Schedule cells of the class at position +count+, grouped by
  # Helpers.format_hours.
  private_class_method def self.scrap_schedules(page, count)
    schedules = page.css('.tabela-oferta')[count]
                    .css('tr td:nth-child(4) .table')
                    .css('td').map(&:text)

    Helpers.format_hours(schedules)
  end

  # Teacher names of the class at position +count+; Helpers.format_teachers
  # supplies the placeholder when none are listed.
  private_class_method def self.scrap_teachers(page, count)
    teachers = page.css('.tabela-oferta')[count]
                   .css('tr td:nth-child(5) td')
                   .map(&:text)

    Helpers.format_teachers(teachers)
  end

  # Total, occupied and free vacancy cells (as text) of the class at
  # position +count+.
  private_class_method def self.scrap_vacancies(page, count)
    {
      vacancies_total: scrap_vacancy(1, page, count),
      vacancies_occupied: scrap_vacancy(2, page, count),
      vacancies_free: scrap_vacancy(3, page, count)
    }
  end

  # Text of the third cell in row +vacancy_row+ of the class's vacancy table.
  private_class_method def self.scrap_vacancy(vacancy_row, page, count)
    page.css('.tabela-oferta')[count]
        .css(".tabela-vagas tr:nth-child(#{vacancy_row}) td:nth-child(3)").text
  end
end
|
|
92
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'mwcrawler/curriculum'
|
|
4
|
+
|
|
5
|
+
module Mwcrawler
|
|
6
|
+
# Scrapes the list of courses offered on a campus.
module Courses
  # Downloads the campus course listing and turns it into one hash per course.
  #
  # @param campus [Symbol] campus name understood by Campuses.id
  # @return [Array<Hash>] course rows, each including its curriculums
  def self.scrap(campus)
    cells = Helpers.set_crawler(campus, 'graduacao/curso_rel.aspx?cod=')
                   .css('#datatable tr td')
                   .map(&:text)

    # Each course becomes one row; scrap_row consumes the cells it uses.
    rows = []
    until cells.empty?
      rows << scrap_row(cells)
    end
    Helpers.log "Total de cursos: #{rows.count}"

    rows
  end

  # Removes the next four cells (type, code, name, shift) from +courses+ and
  # looks up the curriculums of that course code.
  private_class_method def self.scrap_row(courses)
    type, code, name, shift = courses.shift(4)
    {
      'type' => type,
      'code' => code,
      'name' => name,
      'shift' => shift,
      'curriculums' => Curriculum.scrap(code)
    }
  end
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Mwcrawler
|
|
4
|
+
# Public entry point of the gem: one instance method per crawlable resource.
class Crawler
  include Mwcrawler

  # Resource name => module whose `scrap` method does the crawling.
  SCRAPPERS = {
    courses: Courses,
    classes: Classes,
    departments: Departments
  }.freeze

  # Defines #courses, #classes and #departments. Each one stores the options
  # and forwards its first argument to the matching scrapper.
  SCRAPPERS.each do |resource, scrapper|
    define_method(resource) do |campus = :darcy_ribeiro, options = { log: false }|
      Options.init(options)
      scrapper.scrap(campus)
    end
  end

  # Crawls subjects; +options+ is also forwarded to Subjects.scrap, which
  # reads :by_id / :by_department from it.
  def subjects(department, options = { log: false })
    Options.init(options)
    Subjects.scrap(department, options)
  end

  # Returns the text of the first span inside the "Período Atual" link of the
  # default page.
  def semester
    home = Helpers.set_crawler(nil, 'graduacao/default.aspx', exact: true)
    home.css("a[title='Período Atual'] span").first.text
  end
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Mwcrawler
|
|
4
|
+
# Scrapes the curriculums of a course, identified by its course code.
module Curriculum
  # Pairs every curriculum heading with the table at the same position on the
  # course page.
  #
  # @param code [#to_s] value appended verbatim to the
  #   `graduacao/curso_dados.aspx?cod=` URL
  # @return [Array<Hash>] one row per `.table-responsive .table` element
  def self.scrap(code)
    page = Helpers.set_crawler(code, 'graduacao/curso_dados.aspx?cod=', exact: true)
    names = page.css('.table-responsive h4').map { |heading| heading.children[0].text }
    tables = page.css('.table-responsive .table')

    # A table without a matching heading gets a nil name.
    tables.each_with_index.map { |table, position| scrap_row(names[position], table) }
  end

  # Reads degree, semester limits and credits from the first four rows of
  # +table+.
  private_class_method def self.scrap_row(curriculum_name, table)
    {
      'name' => curriculum_name,
      'degree' => table.css('tr:first td').text,
      'semester_max' => table.css('tr:nth-child(2) td').text,
      'semester_min' => table.css('tr:nth-child(3) td').text,
      'credits' => table.css('tr:nth-child(4) td').text
    }
  end
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Mwcrawler
|
|
4
|
+
# Scrapes the list of departments of a campus.
module Departments
  # Downloads the campus department listing and turns it into one hash per
  # department.
  #
  # @param campus [Symbol] campus name understood by Campuses.id
  # @return [Array<Hash>] rows with 'code', 'acronym' and 'name'
  def self.scrap(campus)
    cells = Helpers.set_crawler(campus, 'graduacao/oferta_dep.aspx?cod=')
                   .css('#datatable tr td')
                   .map(&:text)

    # Each department becomes one row; scrap_row consumes the cells it uses.
    rows = []
    until cells.empty?
      rows << scrap_row(cells)
    end
    rows
  end

  # Removes the next three cells from +departments+ and labels them.
  private_class_method def self.scrap_row(departments)
    code, acronym, name = departments.shift(3)
    { 'code' => code, 'acronym' => acronym, 'name' => name }
  end
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Mwcrawler
|
|
4
|
+
# Maps the supported campus names to the numeric ids used in crawl URLs.
class Campuses
  CAMPUSES = { darcy_ribeiro: 1, planaltina: 2, ceilandia: 3, gama: 4 }.freeze

  # Looks up the numeric id of +campus+.
  #
  # @param campus [Symbol] one of the CAMPUSES keys
  # @return [Integer]
  # @raise [ArgumentError] when +campus+ is not a CAMPUSES key
  def self.id(campus)
    CAMPUSES.fetch(campus) do
      raise ArgumentError, "Campus: #{campus} not in: #{CAMPUSES.keys}"
    end
  end
end
|
|
19
|
+
|
|
20
|
+
# Module-level settings shared by the crawler; currently only the log switch.
module Options
  @log = false

  # Stores the :log entry of +options+ (nil when the key is absent).
  def init(options = { log: false })
    @log = options[:log].freeze
  end

  # Returns whatever value was last stored by init (false until then).
  def log_enabled?
    @log
  end

  module_function :init, :log_enabled?
end
|
|
34
|
+
|
|
35
|
+
# Stateless helper methods shared by the scrapers.
class Helpers
  # Groups a flat list of schedule cells into one entry per meeting.
  #
  # Cells are consumed five at a time: day, start time, end time, a filler
  # cell that is discarded, and the class location.
  #
  # @param schedules [Array] flat cell texts; emptied by this call
  # @param row [Array] accumulator the entries are appended to
  # @return [Array<Array>] +row+, holding one [day, start, end, place] per meeting
  def self.format_hours(schedules, row = [])
    until schedules.empty?
      day, starts_at, ends_at, _filler, place = schedules.shift(5)
      row << [day, starts_at, ends_at, place]
    end
    row
  end

  # Returns +teachers+, or the placeholder ['A Designar'] when the list is
  # empty.
  def self.format_teachers(teachers)
    teachers.empty? ? ['A Designar'] : teachers
  end

  # Downloads and parses a page of the site.
  #
  # @param id [Symbol, #to_s] campus name, or (with exact: true) a value
  #   appended verbatim to the URL
  # @param search_mode [String] path appended to SITE; selects the classes,
  #   courses or curriculum pages
  # @param options [Hash] pass exact: true to skip the campus-name lookup
  # @return the document produced by Nokogiri::HTML
  # @raise [ArgumentError] when +id+ is not a known campus and :exact is unset
  def self.set_crawler(id, search_mode, options = { exact: false })
    id = Campuses.id id unless options[:exact]
    url = SITE + search_mode + id.to_s
    Nokogiri::HTML(URI.parse(url).open)
  end

  # Serializes +object+ with to_json and writes it to +file_name+,
  # truncating any existing content.
  def self.write_json(file_name, object)
    File.open(file_name, 'w+') do |f|
      f.write object.to_json
    end
  end

  # Prints +msg+ only when logging was enabled through Options.init.
  def self.log(msg)
    puts msg if Options.log_enabled?
  end

  # Parses the query string of +uri+ into a Hash of String keys and values.
  #
  # Values are returned as written (no percent-decoding). A URI without a
  # query string yields {} instead of raising, a parameter without '=' maps
  # to nil, and only the first '=' of a parameter separates key from value.
  #
  # @param uri [String]
  # @return [Hash{String => String, nil}]
  def self.uri_query_params(uri)
    query_string = URI.parse(uri).query
    return {} if query_string.nil?

    pairs = query_string.split('&').reject(&:empty?).map do |param|
      key, value = param.split('=', 2)
      [key, value]
    end
    pairs.to_h
  end
end
|
|
76
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Mwcrawler
|
|
4
|
+
# Scrapes subjects, either all of a department or a single one by its id.
module Subjects
  # Dispatches on the lookup mode requested in +options+.
  #
  # @param department_or_id [#to_s] department code or subject id, appended
  #   verbatim to the crawled URL
  # @param options [Hash] :by_id or :by_department selects the lookup;
  #   :by_id wins when both are set
  # @return [Hash] a single subject when looked up by id
  # @return [Array<Hash>] every subject of the department otherwise
  # @raise [ArgumentError] when neither :by_id nor :by_department is truthy
  def self.scrap(department_or_id, options)
    return subject_by_id(department_or_id) if options[:by_id]
    return subject_by_department(department_or_id) if options[:by_department]

    raise ArgumentError, 'second argument not specified. You can find a subject by department code or id'
  end

  # Crawls the department's offer page and builds one row per subject.
  private_class_method def self.subject_by_department(department)
    page = Helpers.set_crawler(department, 'graduacao/oferta_dis.aspx?cod=', exact: true)
    scrap_row(department, page)
  end

  # Crawls a single subject's page.
  private_class_method def self.subject_by_id(id)
    page = Helpers.set_crawler(id, 'graduacao/oferta_dados.aspx?cod=', exact: true)
    row_init_by_id(page)
  end

  # Builds the subject row from the first `#datatable` of a subject page; the
  # department code is the first run of digits in the first row's link href.
  private_class_method def self.row_init_by_id(page)
    header = page.css('#datatable')[0]
    department_href = page.css('#datatable tr:first-child a').first['href']
    {
      code: header.css('tr:nth-child(2) td').text.to_i,
      name: header.css('tr:nth-child(3) td').text,
      department: department_href.scan(/\d+/)[0].to_i,
      level: 'graduação'
    }
  end

  # Builds one row per entry of the first table column.
  #
  # Both column queries run once for the whole page rather than once per row,
  # keeping the work linear in the number of subjects.
  private_class_method def self.scrap_row(dep_code, page)
    codes = page.css('#datatable tr td:nth-child(1)').map(&:text)
    names = page.css('#datatable tr td:nth-child(2)').map(&:text)

    codes.each_with_index.map do |code, index|
      row_init_by_department(code, names[index], dep_code)
    end
  end

  # Assembles the row of one subject listed on a department page.
  private_class_method def self.row_init_by_department(code, name, dep_code)
    { code: code.to_i, name: name, department: dep_code.to_i, level: 'graduação' }
  end
end
|
|
49
|
+
end
|
data/mwcrawler.gemspec
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
|
2
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
3
|
+
require 'mwcrawler/version'
|
|
4
|
+
|
|
5
|
+
# Gem metadata, packaged file list and dependencies.
Gem::Specification.new do |spec|
  spec.name          = 'mwcrawler'
  spec.version       = Mwcrawler::VERSION
  spec.authors       = ['Danilo de Lima', 'vitor pontes']
  spec.email         = ['vitormax2005@hotmail.com']

  spec.summary       = 'Gema para webscrapping do sistemas de matriculas da unb Matricula Web.'
  spec.description   = 'Essa gema provê uma api ruby para se fazer o scrapping de páginas html do sistema matricula web e retornar um conteudo que pode ser mais facilmente processado pelo programa'
  spec.homepage      = 'https://github.com/danilodelyima/mwcrawler'
  spec.license       = 'MIT'

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  # if spec.respond_to?(:metadata)
  #   spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
  # else
  #   raise 'RubyGems 2.0 or newer is required to protect against ' \
  #     'public gem pushes.'
  # end

  # Package every git-tracked file except tests, specs and features.
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.16'
  spec.add_development_dependency 'coveralls'
  spec.add_development_dependency 'rake', '~> 13.0'
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'vcr', '~> 4.0'
  spec.add_development_dependency 'webmock', '~> 3.4'
  spec.add_development_dependency 'rubocop', '~> 0.59.2'

  spec.add_dependency 'nokogiri', '~> 1.8'
  # lib/mwcrawler.rb requires pry at load time, so it has to be installed
  # with the gem; as a development-only dependency `require 'mwcrawler'`
  # raised LoadError on a plain install. Move it back to a development
  # dependency once that require is dropped from the library.
  spec.add_dependency 'pry', '~> 0.11'
end
|
metadata
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: mwcrawler
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Danilo de Lima
|
|
8
|
+
- vitor pontes
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: exe
|
|
11
|
+
cert_chain: []
|
|
12
|
+
date: 2020-05-29 00:00:00.000000000 Z
|
|
13
|
+
dependencies:
|
|
14
|
+
- !ruby/object:Gem::Dependency
|
|
15
|
+
name: bundler
|
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
|
17
|
+
requirements:
|
|
18
|
+
- - "~>"
|
|
19
|
+
- !ruby/object:Gem::Version
|
|
20
|
+
version: '1.16'
|
|
21
|
+
type: :development
|
|
22
|
+
prerelease: false
|
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
24
|
+
requirements:
|
|
25
|
+
- - "~>"
|
|
26
|
+
- !ruby/object:Gem::Version
|
|
27
|
+
version: '1.16'
|
|
28
|
+
- !ruby/object:Gem::Dependency
|
|
29
|
+
name: coveralls
|
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
|
31
|
+
requirements:
|
|
32
|
+
- - ">="
|
|
33
|
+
- !ruby/object:Gem::Version
|
|
34
|
+
version: '0'
|
|
35
|
+
type: :development
|
|
36
|
+
prerelease: false
|
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
38
|
+
requirements:
|
|
39
|
+
- - ">="
|
|
40
|
+
- !ruby/object:Gem::Version
|
|
41
|
+
version: '0'
|
|
42
|
+
- !ruby/object:Gem::Dependency
|
|
43
|
+
name: pry
|
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
|
45
|
+
requirements:
|
|
46
|
+
- - "~>"
|
|
47
|
+
- !ruby/object:Gem::Version
|
|
48
|
+
version: '0.11'
|
|
49
|
+
type: :development
|
|
50
|
+
prerelease: false
|
|
51
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
52
|
+
requirements:
|
|
53
|
+
- - "~>"
|
|
54
|
+
- !ruby/object:Gem::Version
|
|
55
|
+
version: '0.11'
|
|
56
|
+
- !ruby/object:Gem::Dependency
|
|
57
|
+
name: rake
|
|
58
|
+
requirement: !ruby/object:Gem::Requirement
|
|
59
|
+
requirements:
|
|
60
|
+
- - "~>"
|
|
61
|
+
- !ruby/object:Gem::Version
|
|
62
|
+
version: '13.0'
|
|
63
|
+
type: :development
|
|
64
|
+
prerelease: false
|
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
66
|
+
requirements:
|
|
67
|
+
- - "~>"
|
|
68
|
+
- !ruby/object:Gem::Version
|
|
69
|
+
version: '13.0'
|
|
70
|
+
- !ruby/object:Gem::Dependency
|
|
71
|
+
name: rspec
|
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
|
73
|
+
requirements:
|
|
74
|
+
- - "~>"
|
|
75
|
+
- !ruby/object:Gem::Version
|
|
76
|
+
version: '3.0'
|
|
77
|
+
type: :development
|
|
78
|
+
prerelease: false
|
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
80
|
+
requirements:
|
|
81
|
+
- - "~>"
|
|
82
|
+
- !ruby/object:Gem::Version
|
|
83
|
+
version: '3.0'
|
|
84
|
+
- !ruby/object:Gem::Dependency
|
|
85
|
+
name: vcr
|
|
86
|
+
requirement: !ruby/object:Gem::Requirement
|
|
87
|
+
requirements:
|
|
88
|
+
- - "~>"
|
|
89
|
+
- !ruby/object:Gem::Version
|
|
90
|
+
version: '4.0'
|
|
91
|
+
type: :development
|
|
92
|
+
prerelease: false
|
|
93
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
94
|
+
requirements:
|
|
95
|
+
- - "~>"
|
|
96
|
+
- !ruby/object:Gem::Version
|
|
97
|
+
version: '4.0'
|
|
98
|
+
- !ruby/object:Gem::Dependency
|
|
99
|
+
name: webmock
|
|
100
|
+
requirement: !ruby/object:Gem::Requirement
|
|
101
|
+
requirements:
|
|
102
|
+
- - "~>"
|
|
103
|
+
- !ruby/object:Gem::Version
|
|
104
|
+
version: '3.4'
|
|
105
|
+
type: :development
|
|
106
|
+
prerelease: false
|
|
107
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
108
|
+
requirements:
|
|
109
|
+
- - "~>"
|
|
110
|
+
- !ruby/object:Gem::Version
|
|
111
|
+
version: '3.4'
|
|
112
|
+
- !ruby/object:Gem::Dependency
|
|
113
|
+
name: rubocop
|
|
114
|
+
requirement: !ruby/object:Gem::Requirement
|
|
115
|
+
requirements:
|
|
116
|
+
- - "~>"
|
|
117
|
+
- !ruby/object:Gem::Version
|
|
118
|
+
version: 0.59.2
|
|
119
|
+
type: :development
|
|
120
|
+
prerelease: false
|
|
121
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
122
|
+
requirements:
|
|
123
|
+
- - "~>"
|
|
124
|
+
- !ruby/object:Gem::Version
|
|
125
|
+
version: 0.59.2
|
|
126
|
+
- !ruby/object:Gem::Dependency
|
|
127
|
+
name: nokogiri
|
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
|
129
|
+
requirements:
|
|
130
|
+
- - "~>"
|
|
131
|
+
- !ruby/object:Gem::Version
|
|
132
|
+
version: '1.8'
|
|
133
|
+
type: :runtime
|
|
134
|
+
prerelease: false
|
|
135
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
136
|
+
requirements:
|
|
137
|
+
- - "~>"
|
|
138
|
+
- !ruby/object:Gem::Version
|
|
139
|
+
version: '1.8'
|
|
140
|
+
description: Essa gema provê uma api ruby para se fazer o scrapping de páginas html
|
|
141
|
+
do sistema matricula web e retornar um conteudo que pode ser mais facilmente processado
|
|
142
|
+
pelo programa
|
|
143
|
+
email:
|
|
144
|
+
- vitormax2005@hotmail.com
|
|
145
|
+
executables: []
|
|
146
|
+
extensions: []
|
|
147
|
+
extra_rdoc_files: []
|
|
148
|
+
files:
|
|
149
|
+
- ".gitignore"
|
|
150
|
+
- ".rspec"
|
|
151
|
+
- ".rubocop.yml"
|
|
152
|
+
- ".ruby-version"
|
|
153
|
+
- ".travis.yml"
|
|
154
|
+
- Gemfile
|
|
155
|
+
- Gemfile.lock
|
|
156
|
+
- LICENSE.txt
|
|
157
|
+
- README.md
|
|
158
|
+
- Rakefile
|
|
159
|
+
- TOTAL.txt
|
|
160
|
+
- bin/console
|
|
161
|
+
- bin/setup
|
|
162
|
+
- lib/mwcrawler.rb
|
|
163
|
+
- lib/mwcrawler/classes.rb
|
|
164
|
+
- lib/mwcrawler/courses.rb
|
|
165
|
+
- lib/mwcrawler/crawler.rb
|
|
166
|
+
- lib/mwcrawler/curriculum.rb
|
|
167
|
+
- lib/mwcrawler/departments.rb
|
|
168
|
+
- lib/mwcrawler/helpers.rb
|
|
169
|
+
- lib/mwcrawler/subjects.rb
|
|
170
|
+
- lib/mwcrawler/version.rb
|
|
171
|
+
- mwcrawler.gemspec
|
|
172
|
+
homepage: https://github.com/danilodelyima/mwcrawler
|
|
173
|
+
licenses:
|
|
174
|
+
- MIT
|
|
175
|
+
metadata: {}
|
|
176
|
+
post_install_message:
|
|
177
|
+
rdoc_options: []
|
|
178
|
+
require_paths:
|
|
179
|
+
- lib
|
|
180
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
181
|
+
requirements:
|
|
182
|
+
- - ">="
|
|
183
|
+
- !ruby/object:Gem::Version
|
|
184
|
+
version: '0'
|
|
185
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
186
|
+
requirements:
|
|
187
|
+
- - ">="
|
|
188
|
+
- !ruby/object:Gem::Version
|
|
189
|
+
version: '0'
|
|
190
|
+
requirements: []
|
|
191
|
+
rubyforge_project:
|
|
192
|
+
rubygems_version: 2.7.6
|
|
193
|
+
signing_key:
|
|
194
|
+
specification_version: 4
|
|
195
|
+
summary: Gema para webscrapping do sistemas de matriculas da unb Matricula Web.
|
|
196
|
+
test_files: []
|