factbook 2.0.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +0 -61
- data/README.md +8 -506
- data/Rakefile +4 -9
- data/lib/factbook.rb +4 -64
- metadata +6 -124
- data/data/attributes.yml +0 -337
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook/almanac.rb +0 -72
- data/lib/factbook/attributes.rb +0 -74
- data/lib/factbook/builder.rb +0 -212
- data/lib/factbook/builder_item.rb +0 -126
- data/lib/factbook/builder_json.rb +0 -79
- data/lib/factbook/codes.rb +0 -119
- data/lib/factbook/comparisons.rb +0 -50
- data/lib/factbook/counter.rb +0 -48
- data/lib/factbook/db/importer.rb +0 -92
- data/lib/factbook/db/models.rb +0 -11
- data/lib/factbook/db/schema.rb +0 -36
- data/lib/factbook/normalize.rb +0 -43
- data/lib/factbook/page.rb +0 -148
- data/lib/factbook/page_info.rb +0 -12
- data/lib/factbook/reader_json.rb +0 -51
- data/lib/factbook/sanitizer.rb +0 -178
- data/lib/factbook/sect.rb +0 -29
- data/lib/factbook/subsect.rb +0 -18
- data/lib/factbook/table.rb +0 -52
- data/lib/factbook/utils.rb +0 -85
- data/lib/factbook/utils_info.rb +0 -129
- data/lib/factbook/version.rb +0 -21
- data/script/almanac.rb +0 -48
- data/script/attributes.rb +0 -34
- data/script/build.rb +0 -28
- data/script/counter.rb +0 -145
- data/script/json.rb +0 -19
- data/script/testbr.rb +0 -33
- data/script/testcodes.rb +0 -11
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/helper.rb +0 -11
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_builder.rb +0 -35
- data/test/test_codes.rb +0 -76
- data/test/test_comparisons.rb +0 -19
- data/test/test_convert.rb +0 -30
- data/test/test_counter.rb +0 -31
- data/test/test_fields.rb +0 -52
- data/test/test_importer.rb +0 -56
- data/test/test_item_builder.rb +0 -99
- data/test/test_json.rb +0 -45
- data/test/test_json_builder.rb +0 -25
- data/test/test_normalize.rb +0 -23
- data/test/test_page.rb +0 -38
- data/test/test_sanitizer.rb +0 -39
- data/test/test_sanitizer_regex.rb +0 -89
data/lib/factbook/page.rb
DELETED
@@ -1,148 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
|
5
|
-
## note:
|
6
|
-
## some factbook pages with chrome (headers, footers, etc.)
|
7
|
-
## are NOT valid utf-8, thus,
|
8
|
-
## treat page as is (e.g. ASCII8BIT)
|
9
|
-
#
|
10
|
-
# only convert to utf8 when header and footer got stripped
|
11
|
-
|
12
|
-
##
|
13
|
-
## be/benin:
|
14
|
-
## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
|
15
|
-
#
|
16
|
-
## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
|
17
|
-
# Lazare Sèhouéto
|
18
|
-
#
|
19
|
-
# looks good - use (assume) Windows-1252 ????
|
20
|
-
|
21
|
-
##
|
22
|
-
# check for is ascii 7-bit ??? if yes -noworries
|
23
|
-
# if not, log number of chars not using ascii 7-bit
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
# One factbook country page.
#
# A page is built either from a JSON string (opts[:json]) or from HTML
# (opts[:html], or - when neither is given - the page read via Webcache
# from SITE_BASE for the passed-in country code).
class Page
  include LogUtils::Logging

  # "structured" access e.g. sects/subsects/etc.
  attr_reader :sects
  # meta info e.g. country_code, country_name, region_name, last_updated, etc.
  attr_reader :info
  # "plain" access with vanilla hash (section title => section data)
  attr_reader :data

  ## standard version (note: requires https)
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'

  # code - country code; only used to build the page url when neither
  #        opts[:json] nor opts[:html] is passed in
  # opts - :json => json text (a string, NOT yet parsed)
  #        :html => html text (note: expects ASCII-7BIT/BINARY encoding);
  #                 handy for debugging and testing with a "custom" page
  #        :info => page info that overwrites the info found by the builder
  def initialize( code, opts={} )
    builder =
      if opts[:json]
        JsonBuilder.from_string( opts[:json] )
      else
        ## assume html - use passed-in page or read the page by url
        Builder.from_string( opts[:html] || Webcache.read( SITE_BASE.gsub( '{code}', code ) ) )
      end

    @sects = builder.sects
    @info  = builder.info

    ## todo/fix/quick hack: an info opts hash entry lets you overwrite the page info
    @info  = opts[:info] if opts[:info]

    @data = {}
    @sects.each { |sect| @data[ sect.title ] = sect.data }
  end

  # Convenience helper for data.to_json; note: pretty prints by default -
  # pass in minify: true for the compact version.
  def to_json( opts={} )
    return data.to_json if opts[:minify]

    JSON.pretty_generate( data )
  end

  # Convenience shortcut - lets you use page['geo'] instead of page.data['geo'].
  def []( key )
    data[key]
  end

  ## add convenience (shortcut) accessors / attributes / fields / getters
  ##  e.g.
  ##    def background()  data['Introduction']['Background']['text']; end
  ##    def location()    data['Geography']['Location']['text']; end
  ##  a missing category or path entry results in nil (NOT an error)
  ATTRIBUTES.each do |attrib|
    single_level = attrib.path.size == 1   ## otherwise assume size 2 for now

    define_method( attrib.name.to_sym ) do
      node = @data.fetch( attrib.category, {} ).fetch( attrib.path[0], {} )
      node = node.fetch( attrib.path[1], {} ) unless single_level
      node['text']
    end
  end

  private

  # Gets the page via Webget and returns the response text.
  # note: exits the process (status 1) if the response status is not ok.
  def fetch_page( url )
    res = Webget.page( url )
    if res.status.nok?    ## e.g. HTTP status code != 200
      exit 1
    end
    res.text
  end

  ## old (disabled) constructors - kept for reference:
  #
  #  def self.from_url( cc, cn )
  #    html_ascii = PageFetcher.new.fetch( cc )
  #    self.new( cc, cn, html_ascii )
  #  end
  #
  #  def self.from_file( cc, cn, opts={} )
  #    input_dir = opts[:input_dir] || '.'
  #    html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
  #    self.new( cc, cn, html_ascii )
  #  end
end # class Page
|
148
|
-
end # module Factbook
|
data/lib/factbook/page_info.rb
DELETED
data/lib/factbook/reader_json.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
# Reads factbook pages from json files stored as
#   <json_dir>/<region-slug>/<country code>.json
class JsonPageReader
  # json_dir - root directory holding one sub directory per region
  def initialize( json_dir )
    @json_dir = json_dir
  end

  # Reads the json file for one code record and returns the page.
  #
  # code - record responding to code, name and region
  #
  # File.read raises (e.g. Errno::ENOENT) if the json file is missing.
  def read_page( code )
    path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"

    puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
    json = File.read( path )

    ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
    #   add some page info from code struct
    info = PageInfo.new
    info.country_code = code.code
    info.country_name = code.name
    info.region_name  = code.region

    Page.new( code.code, json: json, info: info )
  end

  # Reads the pages for all codes and returns them in an array.
  #
  # codes - collection of code records (only each gets used)
  # limit - for debugging: only read the first limit entries;
  #         nil (default) reads all
  def read_pages( codes, limit: nil )
    pages = []
    codes.each do |code|
      ## note: stop once limit is reached; the former counter never got
      ##   incremented, thus, the limit was ignored and all pages got read
      break if limit && pages.size >= limit

      pages << read_page( code )
    end
    pages
  end

  private

  # Converts a region name to the directory slug:
  #   - change and  =>  n
  #   - change &    =>  n
  #   - change all spaces to => -
  #  e.g. East & Southeast Asia          => east-n-southeast-asia
  #       Central America and Caribbean  => central-america-n-caribbean
  def region_to_slug( text )
    text.downcase.gsub('and', 'n').gsub( '&', 'n' ).gsub( ' ', '-' )
  end
end ## JsonPageReader
|
50
|
-
|
51
|
-
end # module Factbook
|
data/lib/factbook/sanitizer.rb
DELETED
@@ -1,178 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
# Cuts a factbook html page down to the country profile
# (headers, footers, scripts, etc. removed) and collects the page (meta) info.
class Sanitizer
  include LogUtils::Logging
  include Utils    ## pulls in encode_utf8, ...
                   ## NOTE(review): find_page_info, find_country_code and
                   ##   find_page_last_updated presumably come from Utils too - confirm

  #
  #  <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
  #
  #  remove aria labels
  ARIA_ATTR_REGEX = /\s*
                      aria-label=('|").+?\1     ## note: use non-greedy match e.g. .+?
                    /xim   ## do NOT allow multi-line - why? why not?

  # Returns an array with three entries:
  #   1) html profile without headers, footers, scripts, etc.
  #   2) page (meta) info e.g. country_name, country_code, last_updated, etc.
  #   3) errors - for now always an empty list
  #
  # html_ascii - the page as is (before any conversion to utf-8)
  def sanitize( html_ascii )
    ## todo: add option for (html source) encoding - why?? why not??

    page_info = PageInfo.new

    ## todo:
    ## make page info optional? why? why not?
    ##   not always available (if page structure changes) - check
    ##   what page info is required??
    h = find_page_info( html_ascii )
    if h
      page_info.country_code        = h[:country_code]
      page_info.country_name        = h[:country_name]
      page_info.country_affiliation = h[:country_affiliation]
      page_info.region_code         = h[:region_code]
      page_info.region_name         = h[:region_name]
    else
      ## no page info found - fall back to the country code only
      page_info.country_code = find_country_code( html_ascii )
    end

    page_info.last_updated = find_page_last_updated( html_ascii )

    html = find_country_profile( html_ascii )    ## cut-off headers, footers, scripts, etc.

    ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
    # html, errors = encode_utf8( html_profile_ascii )  ## change encoding to utf-8 (from binary/ascii8bit)
    # html = sanitize_profile( html )

    [html, page_info, []]
  end

  # Builds the simplified profile html from the <ul class="expandcollapse"> list.
  #
  # Assumes two <li>s are a section (title + body) and inside the body
  # two <div>s are a subsection (anchor with title + category data).
  # Results in <h2> (section), <h3> (subsection) and the category_data divs
  # (attachments e.g. maps, pop pyramids, etc. skipped; aria-label attributes removed).
  #
  # Returns an empty string if the list is missing or the page uses the old (h2) format.
  # note: exits the process (status 1) if a section has no title.
  def find_country_profile( html )
    doc = Nokogiri::HTML( html )

    ## remove header (everything before) <ul class="expandcollapse">
    ul = doc.css( 'ul.expandcollapse' ).first
    if ul.nil?
      ## note: was a NoMethodError on nil; treat like the (unsupported) old format
      puts " !! WARN: no ul.expandcollapse found - unknown page format - sorry!!!"
      return ''
    end

    puts ul.to_html[0..100]

    ## note: special case cc uses h2 instead of div block
    ##   <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
    ##       style="border-bottom: 2px solid white; cursor: pointer;">
    ##     Introduction :: <span class="region">CURACAO </span>
    ##   </h2>
    ##  is old format !!!!
    ##   cc - CURACAO
    ##   http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
    ##   page says         - PAGE LAST UPDATED ON MARCH 14, 2018
    ##   wait for new version to be generated / pushed!!!

    ## check for old format if h2 are present
    h2s = ul.css( 'h2' )
    unless h2s.empty?
      puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
      ## return empty html string - why? why not?
      return ''
    end

    ###
    ## sanitize
    ##   remove link items

    buf = String.new('')

    ## filter all li's (skip everything else e.g. text nodes)
    lis = ul.children.select { |el| el.name == 'li' }
    puts " #{lis.size} li(s):"

    lis.each_slice(2) do |title_li, body_li|
      title_div = title_li.at( 'div[sectiontitle]' )
      if title_div.nil?
        puts "!! ERROR: no section title found in div:"
        puts title_li.to_html
        exit 1
      end

      buf << "<h2>#{title_div['sectiontitle'].to_s}</h2>\n"

      if body_li.nil?
        ## odd number of li's - was a NoMethodError on nil
        puts "!! WARN: no li with section data found for section title:"
        puts title_li.to_html
        next
      end

      ## filter all div's
      divs = body_li.children.select { |el| el.name == 'div' }
      puts " #{divs.size} div(s):"

      divs.each_slice(2) do |head_div, data_div|
        a = head_div.css('a').first
        if a
          buf << "\n<h3>#{a.text}:</h3>\n"
        else
          puts "!! WARN: no anchor found:"
          puts head_div.to_html
        end

        if data_div.nil?
          ## odd number of div's - was a NoMethodError on nil
          puts "!! WARN: no div with category data found for:"
          puts head_div.to_html
          next
        end

        data_div.children.each do |catdiv|
          next unless catdiv.name == 'div'

          css_class = catdiv['class']
          if css_class && css_class.index( 'category_data' )
            ## skip attachments e.g. maps, pop pyramids, etc.
            next if css_class.index( 'attachment' )

            buf << catdiv.to_html
            buf << "\n"
          else
            puts "!! WARN: skipping div (W/O category_data class):"
            puts catdiv.to_html
          end
        end
      end
    end

    buf.gsub( ARIA_ATTR_REGEX ) do |m|
      puts "remove aria-label attr:"
      puts "#{m}"
      ''
    end
  end
end # class Sanitizer
|
177
|
-
|
178
|
-
end # module Factbook
|
data/lib/factbook/sect.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
# A page section - a title plus a list of subsections.
class Sect
  include LogUtils::Logging

  # title    - use name instead of title - why? why not?
  # subsects - list of subsections (empty to start with)
  attr_accessor :title, :subsects

  def initialize
    @subsects = []
  end

  # Converts the subsections to a (fresh) hash - subsect title => subsect data;
  # note: gets rebuilt on every call.
  def data
    @data = {}
    subsects.each_with_object( @data ) { |subsect, hash| hash[ subsect.title ] = subsect.data }
  end
end # class Sect
|
28
|
-
|
29
|
-
end # module Factbook
|
data/lib/factbook/subsect.rb
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
# A page subsection - a title plus a data hash.
class Subsect
  include LogUtils::Logging

  # title - use name instead of title - why? why not?
  # data  - hash holding data e.g. { 'text' => '...' etc. }
  attr_accessor :title, :data

  # Starts out with an empty data hash (and no title).
  def initialize
    @data = {}
  end
end # class Subsect
|
17
|
-
|
18
|
-
end # module Factbook
|
data/lib/factbook/table.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
##
|
6
|
-
## make more "generic" - why? why not?
|
7
|
-
## (re)use for other files ?? move to textutils ??
|
8
|
-
|
9
|
-
##
|
10
|
-
## for now reads in rows with values separated by at least 3+ spaces e.g.:
|
11
|
-
## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
|
12
|
-
## 1 China 1,367,485,388
|
13
|
-
## 2 India 1,251,695,584
|
14
|
-
## 3 European Union 513,949,445
|
15
|
-
## 4 United States 321,368,864
|
16
|
-
## 5 Indonesia 255,993,674
|
17
|
-
## 6 Brazil 204,259,812
|
18
|
-
|
19
|
-
|
20
|
-
# Reads in rows with values separated by three or more spaces.
class TableReader
  include LogUtils::Logging

  # text - the table as text, one row per line
  def initialize( text )
    @text = text
  end

  # Returns the rows as a list of value lists (strings);
  # empty lines get skipped (and reported with the line number).
  def read
    rows = []

    @text.each_line.with_index( 1 ) do |raw, line_no|
      row = raw.strip   ## remove leading and trailing whitespace

      if row.empty?
        puts "** skipping empty line #{line_no}"
      else
        ## split three or more spaces - use just two ?? why? why not??
        rows << row.split( /[ ]{3,}/ )
      end
    end

    rows
  end
end # class TableReader
|
51
|
-
|
52
|
-
end # module Factbook
|