history_scraper 1.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/history_scraper.rb +66 -0
- metadata +59 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 3246785d0b76adc995bf2b8f68690f70eb82bb4f
|
4
|
+
data.tar.gz: d58132b79601c8132d2daaede729462b1aade7da
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 6ec98869c9822ca7e1da3db137ddf13b455d535d63ae8798ff060d20b326f0e0e58ccb1f2702ea30f36661614359ecb74b551fc5b6509a4d6417533acc29213d
|
7
|
+
data.tar.gz: 231add83bd0beae6abb2ebecc3dd48dcc4824f48d3878f9c0370542579d6518efcee8f3e02c1745689363fe18b6c5657ffcde4f70fe9fccaf858d753c2489fa7
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'date'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
# Scrapes the Wikipedia "day of the year" article for every calendar date and
# exports the collected data as JSON.
#
# Each scraped date is stored under a :"Month-day" key (e.g. :"January-1")
# holding its description, events, births and deaths.
#
# @param output_dir [String] destination handed to export_to_file
# @return the value returned by export_to_file
def scrap_year(output_dir)
  # A leap year is used as the reference so that February 29 is included.
  reference_year = 2000
  result = {}

  (1..12).each do |month_index|
    (1..31).each do |day_index|
      # Skip combinations that are not real dates (February 30, April 31, ...)
      # instead of requesting a page for them.
      next unless Date.valid_date?(reference_year, month_index, day_index)

      begin
        day, month = form_date(day_index, month_index)

        puts "Scraping #{month} #{day}..."

        description, events, births, deaths = extract_from(day, month)

        result["#{month}-#{day}".to_sym] = {
          description: description, events: events, births: births, deaths: deaths
        }
      rescue NoMethodError
        # Raised while extracting when the page lacks an expected section;
        # the date is reported and left out of the result.
        puts 'It seems this date does not have any episodes.'
      end
    end
  end

  export_to_file(result, output_dir)
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
# Converts numeric day and month indices into the pair used to address a
# date: the day of the month and the English month name.
#
# @param day_index [Integer] day of the month
# @param month_index [Integer] month number (1 = January)
# @return [Array] [day, month_name]
def form_date(day_index, month_index)
  raw = "#{day_index}/#{month_index}"
  fields = Date._strptime(raw, '%d/%m')
  month_name = Date::MONTHNAMES[fields[:mon]]

  [fields[:mday], month_name]
end
|
35
|
+
|
36
|
+
# Downloads the English Wikipedia article for a date and extracts its summary
# paragraph together with the Events, Births and Deaths lists.
#
# @param day [Integer] day of the month, as it appears in the article title
# @param month [String] English month name, as it appears in the article title
# @return [Array] [description, events, births, deaths]; description is the
#   first paragraph mentioning "<month> <day>" (nil when none does), and each
#   list holds the entries produced by parse_ul with nil entries removed
# @raise [NoMethodError] when a section heading, or the element following it,
#   is not found in the page
def extract_from(day, month)
  url = "https://en.wikipedia.org/wiki/#{month}_#{day}"

  # URI#open (open-uri) is used instead of Kernel#open, which no longer
  # accepts URLs on Ruby 3+. The block form closes the handle once parsed.
  html = URI.parse(url).open { |page| Nokogiri::HTML(page) }

  description = html.css('#mw-content-text p')
                    .map(&:text)
                    .find { |text| text.include?("#{month} #{day}") }

  # Each list is the element right after the parent of the node carrying the
  # section id.
  events, births, deaths = %w[Events Births Deaths].map do |section_id|
    parse_ul(html.css("##{section_id}")[0].parent.next_element).compact
  end

  [description, events, births, deaths]
end
|
49
|
+
|
50
|
+
# Turns the <li> items of a list node into entry hashes.
#
# The text of each item is split on ' – '; the first piece is the year and the
# remaining pieces, re-joined, are the entry text. Items without both parts
# yield nil, so callers are expected to compact the result.
#
# @param ul [#css] node whose 'li' descendants are parsed
# @return [Array<Hash, nil>] one { year:, data:, kw: } hash or nil per item
def parse_ul(ul)
  ul.css('li').map do |item|
    segments = item.text.split(' – ')
    year = segments.first
    remainder = segments.drop(1)

    next if !year || remainder.empty?

    { year: year, data: remainder.join(' – '), kw: parse_keywords(item) }
  end
end
|
58
|
+
|
59
|
+
# Collects the title and href attributes of every link inside a list item.
#
# @param li [#css] node whose 'a' descendants are inspected
# @return [Array<Hash>] one { title:, href: } hash per link
def parse_keywords(li)
  anchors = li.css('a')

  anchors.map do |anchor|
    { title: anchor['title'], href: anchor['href'] }
  end
end
|
62
|
+
|
63
|
+
# Serializes the collected data to JSON, writes it out and reports where.
#
# @param hash_data [#to_json] data to serialize
# @param dir [String] path of the file to write (used as a file path, despite
#   the parameter name)
# @return [nil]
def export_to_file(hash_data, dir)
  json_payload = hash_data.to_json

  File.open(dir, 'w') { |file| file.write(json_payload) }

  puts "Results stored in #{dir}"
end
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: history_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.6
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Sebastián Salata
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-05-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.7'
|
27
|
+
description: Scrapes events, births and deaths that occurred during a specific day of
|
28
|
+
history.
|
29
|
+
email: sa.salatart@gmail.com
|
30
|
+
executables: []
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- lib/history_scraper.rb
|
35
|
+
homepage: https://github.com/sasalatart/history-scraper-rb
|
36
|
+
licenses:
|
37
|
+
- MIT
|
38
|
+
metadata: {}
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options: []
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements: []
|
54
|
+
rubyforge_project:
|
55
|
+
rubygems_version: 2.6.12
|
56
|
+
signing_key:
|
57
|
+
specification_version: 4
|
58
|
+
summary: Wikipedia history scraper.
|
59
|
+
test_files: []
|