history_scraper 1.0.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/history_scraper.rb +66 -0
  3. metadata +59 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3246785d0b76adc995bf2b8f68690f70eb82bb4f
4
+ data.tar.gz: d58132b79601c8132d2daaede729462b1aade7da
5
+ SHA512:
6
+ metadata.gz: 6ec98869c9822ca7e1da3db137ddf13b455d535d63ae8798ff060d20b326f0e0e58ccb1f2702ea30f36661614359ecb74b551fc5b6509a4d6417533acc29213d
7
+ data.tar.gz: 231add83bd0beae6abb2ebecc3dd48dcc4824f48d3878f9c0370542579d6518efcee8f3e02c1745689363fe18b6c5657ffcde4f70fe9fccaf858d753c2489fa7
@@ -0,0 +1,66 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+ require 'date'
4
+ require 'json'
5
+
6
# Scrapes the Wikipedia day page of every calendar date and exports the result.
#
# Each successfully scraped date is stored under a "Month-day" symbol key
# (for example :"March-5") holding its description, events, births and deaths.
# Dates whose page cannot be parsed are reported on stdout and left out.
#
# @param output_dir [String] destination handed unchanged to export_to_file
# @return the value returned by export_to_file
def scrap_year(output_dir)
  # Any leap year works here: it only decides which day/month pairs exist,
  # and a leap year keeps February 29 in the run.
  leap_year = 2000
  result = {}

  (1..12).each do |month_index|
    (1..31).each do |day_index|
      # Skip impossible dates (February 30, April 31, ...) up front instead of
      # requesting their pages and waiting for the parse to blow up.
      next unless Date.valid_date?(leap_year, month_index, day_index)

      day, month = form_date(day_index, month_index)

      puts "Scraping #{month} #{day}..."

      begin
        description, events, births, deaths = extract_from(day, month)
      rescue NoMethodError
        # extract_from dereferences the Events/Births/Deaths headings without
        # nil checks, so a page lacking one of them surfaces as NoMethodError.
        puts 'It seems this date does not have any episodes.'
        next
      end

      result["#{month}-#{day}".to_sym] = {
        description: description, events: events, births: births, deaths: deaths
      }
    end
  end

  export_to_file(result, output_dir)
end
28
+
29
+ private
30
+
31
# Converts numeric day and month indexes into the parts of a page title.
#
# The pair is parsed with Date._strptime, which only splits the fields and
# does not check that the day exists in that month.
#
# @param day_index [Integer] day of the month
# @param month_index [Integer] month number, 1 for January
# @return [Array] the day of the month followed by the English month name
def form_date(day_index, month_index)
  parsed = Date._strptime(format('%s/%s', day_index, month_index), '%d/%m')
  month_name = Date::MONTHNAMES[parsed[:mon]]

  [parsed[:mday], month_name]
end
35
+
36
# Downloads the English Wikipedia page of a calendar day and extracts its data.
#
# @param day day of the month, interpolated into the page title
# @param month [String] English month name, interpolated into the page title
# @return [Array] four elements: the first paragraph text mentioning
#   "<month> <day>" (nil when none does), followed by the events, births and
#   deaths lists produced by parse_ul with their nil entries removed
# @raise [NoMethodError] when the page has no #Events, #Births or #Deaths
#   element; scrap_year rescues this to skip the date
def extract_from(day, month)
  url = "https://en.wikipedia.org/wiki/#{month}_#{day}"

  # Kernel#open no longer accepts URLs as of Ruby 3.0, so the request goes
  # through the URI object that open-uri extends. The block form closes the
  # downloaded handle once the document has been parsed.
  html = URI.parse(url).open { |page| Nokogiri::HTML(page) }

  description = html.css('#mw-content-text p')
                    .map(&:text)
                    .find { |text| text.include?("#{month} #{day}") }

  # Each list is read from the element that follows the parent of the
  # heading carrying the given id.
  events, births, deaths = %w[#Events #Births #Deaths].map do |heading_id|
    parse_ul(html.css(heading_id)[0].parent.next_element).compact
  end

  [description, events, births, deaths]
end
49
+
50
# Turns the <li> entries of a list element into year/data/keyword hashes.
#
# An entry's text is split on ' – '; the first piece is the year and the
# remaining pieces, re-joined with the same separator, are the data. Entries
# without anything after the year yield nil, so the result keeps one slot
# per <li>.
#
# @param ul element whose 'li' descendants are parsed
# @return [Array] one hash (:year, :data, :kw) or nil per list item
def parse_ul(ul)
  ul.css('li').map do |item|
    pieces = item.text.split(' – ')
    year = pieces.first
    rest = pieces.drop(1)

    if year.nil? || rest.empty?
      nil
    else
      { year: year, data: rest.join(' – '), kw: parse_keywords(item) }
    end
  end
end
58
+
59
# Collects the links found inside a list item.
#
# @param li element whose 'a' descendants are read
# @return [Array<Hash>] one hash per link with its :title and :href attributes
def parse_keywords(li)
  li.css('a').map do |anchor|
    { title: anchor['title'], href: anchor['href'] }
  end
end
62
+
63
# Serializes the scraped data as JSON, writes it to disk and reports the path.
#
# @param hash_data [Hash] data to serialize with to_json
# @param dir [String] path of the file to write (despite the name, this is
#   passed to the file API as a file path)
# @return [nil]
def export_to_file(hash_data, dir)
  serialized = hash_data.to_json
  File.open(dir, 'w') { |file| file.write(serialized) }
  puts "Results stored in #{dir}"
end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: history_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.6
5
+ platform: ruby
6
+ authors:
7
+ - Sebastián Salata
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-05-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ description: Scrapes events, births and deaths that occurred during a specific day of
28
+ history.
29
+ email: sa.salatart@gmail.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - lib/history_scraper.rb
35
+ homepage: https://github.com/sasalatart/history-scraper-rb
36
+ licenses:
37
+ - MIT
38
+ metadata: {}
39
+ post_install_message:
40
+ rdoc_options: []
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ requirements: []
54
+ rubyforge_project:
55
+ rubygems_version: 2.6.12
56
+ signing_key:
57
+ specification_version: 4
58
+ summary: Wikipedia history scraper.
59
+ test_files: []