history_scraper 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/history_scraper.rb +66 -0
  3. metadata +59 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3246785d0b76adc995bf2b8f68690f70eb82bb4f
4
+ data.tar.gz: d58132b79601c8132d2daaede729462b1aade7da
5
+ SHA512:
6
+ metadata.gz: 6ec98869c9822ca7e1da3db137ddf13b455d535d63ae8798ff060d20b326f0e0e58ccb1f2702ea30f36661614359ecb74b551fc5b6509a4d6417533acc29213d
7
+ data.tar.gz: 231add83bd0beae6abb2ebecc3dd48dcc4824f48d3878f9c0370542579d6518efcee8f3e02c1745689363fe18b6c5657ffcde4f70fe9fccaf858d753c2489fa7
@@ -0,0 +1,66 @@
1
+ require 'open-uri'
2
+ require 'nokogiri'
3
+ require 'date'
4
+ require 'json'
5
+
6
# Scrapes every calendar day of the year and exports the collected data.
#
# Only real calendar days are visited (February 29 included); combinations
# such as February 30 or April 31 are skipped instead of being requested.
# A day whose page cannot be parsed or fetched is reported and skipped, so
# the days already collected are still exported.
#
# @param output_dir [String] destination handed to export_to_file
# @return whatever export_to_file returns
def scrap_year(output_dir)
  result = {}

  (1..12).each do |month_index|
    (1..31).each do |day_index|
      # 2000 is a leap year, so February 29 passes the validity check.
      next unless Date.valid_date?(2000, month_index, day_index)

      begin
        day, month = form_date(day_index, month_index)

        puts "Scraping #{month} #{day}..."

        description, events, births, deaths = extract_from(day, month)

        result["#{month}-#{day}".to_sym] = {
          description: description, events: events, births: births, deaths: deaths
        }
      rescue NoMethodError
        # Extraction calls methods on a missing node when a section is absent.
        puts 'It seems this date does not have any episodes.'
      rescue OpenURI::HTTPError => e
        # One unavailable page must not discard the rest of the year.
        puts "Could not fetch #{month} #{day}: #{e.message}"
      end
    end
  end

  export_to_file(result, output_dir)
end
28
+
29
+ private
30
+
31
# Turns a numeric day/month pair into the pieces used to name a date.
#
# The pair is parsed with the '%d/%m' pattern; no calendar validation is
# performed, so e.g. day 31 of month 2 is returned as given.
#
# @param day_index [Integer] day of the month
# @param month_index [Integer] month number
# @return [Array] the parsed day number followed by the English month name
def form_date(day_index, month_index)
  parsed = Date._strptime([day_index, month_index].join('/'), '%d/%m')
  month_name = Date::MONTHNAMES[parsed[:mon]]
  [parsed[:mday], month_name]
end
35
+
36
# Downloads the English Wikipedia article for a day and extracts its data.
#
# @param day [Integer] day of the month, used in the article title
# @param month [String] English month name, used in the article title
# @return [Array] description, events, births and deaths, in that order;
#   the three lists have their nil entries removed
# @raise [NoMethodError] when the Events, Births or Deaths element (or the
#   element following its parent) is missing from the page
def extract_from(day, month)
  url = "https://en.wikipedia.org/wiki/#{month}_#{day}"

  # Kernel#open stopped accepting URLs in Ruby 3.0, so go through the URI
  # object. The block form closes the response IO once parsing is done.
  html = URI.parse(url).open { |page| Nokogiri::HTML(page) }

  description = html.css('#mw-content-text p')
                    .map(&:text)
                    .find { |text| text.include?("#{month} #{day}") }

  # Each list is taken from the element that follows the heading's parent.
  events, births, deaths = %w[Events Births Deaths].map do |section_id|
    parse_ul(html.css("##{section_id}")[0].parent.next_element)
  end

  [description, events.compact, births.compact, deaths.compact]
end
49
+
50
# Converts the <li> items found under a list node into entry hashes.
#
# Each item's text is split on ' – '; the first piece is the year and the
# remaining pieces, joined back together, are the entry text.
#
# @param ul [#css] node whose 'li' descendants are parsed
# @return [Array] one hash (:year, :data, :kw) per item, or nil for an item
#   that has no year or no text after the separator
def parse_ul(ul)
  separator = ' – '

  ul.css('li').map do |item|
    head, *rest = item.text.split(separator)

    if head.nil? || rest.empty?
      nil
    else
      { year: head, data: rest.join(separator), kw: parse_keywords(item) }
    end
  end
end
58
+
59
# Collects the links contained in a list item.
#
# @param li [#css] node whose 'a' descendants are read
# @return [Array<Hash>] one hash per link holding its 'title' and 'href'
#   attribute values under the :title and :href keys
def parse_keywords(li)
  li.css('a').each_with_object([]) do |anchor, keywords|
    keywords << { title: anchor['title'], href: anchor['href'] }
  end
end
62
+
63
# Serializes the scraped data to JSON, writes it out and reports the path.
#
# @param hash_data [Hash] data to serialize with to_json
# @param dir [String] path of the file to write (overwritten if present)
# @return [nil] the result of the final puts
def export_to_file(hash_data, dir)
  serialized = hash_data.to_json
  File.open(dir, 'w') { |file| file.write(serialized) }
  puts "Results stored in #{dir}"
end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: history_scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.6
5
+ platform: ruby
6
+ authors:
7
+ - Sebastián Salata
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-05-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ description: Scrapes events, births and deaths that occurred during a specific day of
28
+ history.
29
+ email: sa.salatart@gmail.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - lib/history_scraper.rb
35
+ homepage: https://github.com/sasalatart/history-scraper-rb
36
+ licenses:
37
+ - MIT
38
+ metadata: {}
39
+ post_install_message:
40
+ rdoc_options: []
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ requirements: []
54
+ rubyforge_project:
55
+ rubygems_version: 2.6.12
56
+ signing_key:
57
+ specification_version: 4
58
+ summary: Wikipedia history scraper.
59
+ test_files: []