factbook 1.1.1 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -58
- data/README.md +50 -575
- data/Rakefile +29 -33
- data/lib/factbook.rb +8 -75
- metadata +20 -114
- data/data/attributes.yml +0 -337
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook/almanac.rb +0 -72
- data/lib/factbook/attributes.rb +0 -74
- data/lib/factbook/builder.rb +0 -214
- data/lib/factbook/builder_item.rb +0 -92
- data/lib/factbook/builder_json.rb +0 -79
- data/lib/factbook/codes.rb +0 -119
- data/lib/factbook/comparisons.rb +0 -50
- data/lib/factbook/counter.rb +0 -48
- data/lib/factbook/db/importer.rb +0 -92
- data/lib/factbook/db/models.rb +0 -11
- data/lib/factbook/db/schema.rb +0 -36
- data/lib/factbook/normalize.rb +0 -43
- data/lib/factbook/page.rb +0 -185
- data/lib/factbook/page_info.rb +0 -12
- data/lib/factbook/reader_json.rb +0 -51
- data/lib/factbook/sanitizer.rb +0 -207
- data/lib/factbook/sect.rb +0 -29
- data/lib/factbook/subsect.rb +0 -18
- data/lib/factbook/table.rb +0 -52
- data/lib/factbook/utils.rb +0 -85
- data/lib/factbook/utils_info.rb +0 -102
- data/lib/factbook/version.rb +0 -22
- data/script/almanac.rb +0 -48
- data/script/attributes.rb +0 -34
- data/script/build.rb +0 -28
- data/script/counter.rb +0 -145
- data/script/json.rb +0 -18
- data/script/testbr.rb +0 -33
- data/script/testcodes.rb +0 -11
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/au.html +0 -2006
- data/test/data/src/be.html +0 -2011
- data/test/helper.rb +0 -11
- data/test/test_attribs.rb +0 -82
- data/test/test_attribs_def.rb +0 -20
- data/test/test_builder.rb +0 -35
- data/test/test_codes.rb +0 -76
- data/test/test_comparisons.rb +0 -19
- data/test/test_convert.rb +0 -30
- data/test/test_counter.rb +0 -31
- data/test/test_fields.rb +0 -52
- data/test/test_importer.rb +0 -55
- data/test/test_item_builder.rb +0 -99
- data/test/test_json.rb +0 -44
- data/test/test_json_builder.rb +0 -25
- data/test/test_normalize.rb +0 -23
- data/test/test_page.rb +0 -38
- data/test/test_sanitizer.rb +0 -35
data/lib/factbook/page.rb
DELETED
@@ -1,185 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
## note:
|
7
|
-
## some factbook pages with chrome (headers, footers, etc.)
|
8
|
-
## are NOT valid utf-8, thus,
|
9
|
-
## treat page as is (e.g. ASCII8BIT)
|
10
|
-
#
|
11
|
-
# only convert to utf8 when header and footer got stripped
|
12
|
-
|
13
|
-
##
|
14
|
-
## be/benin:
|
15
|
-
## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
|
16
|
-
#
|
17
|
-
## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
|
18
|
-
# Lazare Sèhouéto
|
19
|
-
#
|
20
|
-
# looks good - use (assume) Windows-1252 ????
|
21
|
-
|
22
|
-
##
|
23
|
-
# check whether the page is 7-bit ASCII only; if yes - no worries
|
24
|
-
# if not, log number of chars not using ascii 7-bit
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
class Page
  include LogUtils::Logging

  attr_reader :sects    ## "structured" access e.g. sects/subsects/etc.
  attr_reader :info     ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
  attr_reader :data     ## "plain" access with vanilla hash


  ## standard version (note: requires https)
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'

  ## Builds a page for the given country code.
  ##
  ## The page source is picked from opts in this order:
  ##   opts[:json]  - json text (a string, NOT yet parsed); handed to JsonBuilder
  ##   opts[:html]  - passed-in html page (for debugging and testing);
  ##                  note: expects ASCII-7BIT/BINARY encoding
  ##   otherwise    - the html page gets fetched from SITE_BASE (with {code} replaced)
  ##
  ## opts[:info] (if present) overwrites the page info reported by the builder.
  ##
  ## Raises a RuntimeError (via fetch_page) if the page needs to get fetched
  ## and the response code is not '200'.
  def initialize( code, opts={} )
    ### keep code - why? why not??  (use page_info/info e.g. info.country_code??)

    builder = if opts[:json]
                ## note: json is (still) a string/text (NOT yet parsed to structured data)
                JsonBuilder.from_string( opts[:json] )
              else   ## assume html
                ## note: expects ASCII-7BIT/BINARY encoding
                html = opts[:html] || fetch_page( SITE_BASE.gsub( '{code}', code ) )
                Builder.from_string( html )
              end

    @sects = builder.sects
    @info  = builder.info

    ## todo/fix/quick hack:
    ##  check for info opts hash entry - lets you overwrite page info
    ##   -- use proper header to setup page info - why, why not??
    @info = opts[:info]   if opts[:info]

    ## "plain" hash: section title => section data
    @data = {}
    @sects.each do |sect|
      @data[ sect.title ] = sect.data
    end
  end


  ## Convenience helper for data.to_json; note: pretty print by default!
  ## Pass in minify: true to get the compact (one-line) version.
  def to_json( opts={} )
    return data.to_json   if opts[:minify]

    ## was: -- opts[:pretty] || opts[:pp]
    JSON.pretty_generate( data )   ## note: pretty print by default!
  end


  ## Convenience shortcut - lets you use
  ##     page['geo']
  ##   instead of
  ##     page.data['geo']
  def [](key)
    ## fix: use delegate data, [] from forwardable lib - why?? why not??
    data[key]
  end

  ## add convenience (shortcut) accessors / attributes / fields / getters
  ##   e.g.
  ##     def background()  data['Introduction']['Background']['text']; end
  ##     def location()    data['Geography']['Location']['text'];      end
  ##     etc.
  ##
  ##  note: ATTRIBUTES is defined elsewhere; every entry is used here via
  ##        its name, category and path only
  ATTRIBUTES.each do |attrib|
    keys = if attrib.path.size == 1
             [attrib.path[0]]
           else  ## assume size 2 for now
             [attrib.path[0], attrib.path[1]]
           end

    define_method attrib.name.to_sym do
      ## walk down category -> path; missing levels fall back to an empty hash
      ##   (resulting in nil for the text)
      node = keys.reduce( @data.fetch( attrib.category, {} ) ) do |hash, key|
        hash.fetch( key, {} )
      end
      node['text']
    end
  end


private
  ## Fetches the page and returns the response body as is
  ##  (no encoding conversion here).
  ## Logs and raises a RuntimeError if the response code is not '200'.
  def fetch_page( url_string )
    response = Fetcher::Worker.new.get_response( url_string )

    unless response.code == '200'
      logger.error "fetch HTTP - #{response.code} #{response.message}"
      ## todo/fix: raise http exception (see fetcher)  -- why? why not??
      fail "fetch HTTP - #{response.code} #{response.message}"
    end

    body = response.body
    ###
    # NB: Net::HTTP will NOT set encoding UTF-8 etc.
    #  will mostly be ASCII
    #  ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
    logger.debug "t.encoding.name (before): #{body.encoding.name}"
    body
  end


=begin
def self.from_url( cc, cn )
  html_ascii = PageFetcher.new.fetch( cc )
  self.new( cc, cn, html_ascii )
end

def self.from_file( cc, cn, opts={} )
  input_dir = opts[:input_dir] || '.'
  html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
  self.new( cc, cn, html_ascii )
end
=end


end # class Page
|
160
|
-
|
161
|
-
|
162
|
-
=begin
|
163
|
-
class PageFetcher
|
164
|
-
|
165
|
-
def fetch( cc )
|
166
|
-
worker = Fetcher::Worker.new
|
167
|
-
factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
|
168
|
-
|
169
|
-
res = worker.get_response( "#{factbook_base}/#{cc}.html" )
|
170
|
-
|
171
|
-
# on error throw exception - why? why not??
|
172
|
-
if res.code != '200'
|
173
|
-
raise Fetcher::HttpError.new( res.code, res.message )
|
174
|
-
end
|
175
|
-
|
176
|
-
###
|
177
|
-
# Note: Net::HTTP will NOT set encoding UTF-8 etc.
|
178
|
-
# will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
179
|
-
html = res.body.to_s
|
180
|
-
end
|
181
|
-
end # PageFetcher
|
182
|
-
=end
|
183
|
-
|
184
|
-
|
185
|
-
end # module Factbook
|
data/lib/factbook/page_info.rb
DELETED
data/lib/factbook/reader_json.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
## Reads factbook pages from (pre-built) json files stored in
##   <json_dir>/<region-slug>/<country-code>.json
class JsonPageReader
  def initialize( json_dir )
    @json_dir = json_dir
  end

  ## Reads the json file for the given code record and returns a Page.
  ##  note: code is used via code.code, code.name and code.region
  def read_page( code )
    path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"

    puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
    json = File.read( path )

    ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
    #   add some page info from code struct
    info = PageInfo.new
    info.country_code = code.code
    info.country_name = code.name
    info.region_name  = code.region

    Page.new( code.code, json: json, info: info )
  end

  ## Reads a page for every code record and returns the pages in an array.
  ##   limit: for debugging - just process the first x entries (nil => all)
  ##
  ##  note: only relies on codes.each; stops iterating once the limit is reached
  def read_pages( codes, limit: nil )
    pages = []
    codes.each do |code|
      break  if limit && pages.size >= limit

      pages << read_page( code )
    end
    pages
  end

private
  ## Turns a region name into the directory slug:
  ##   change and => n    (whole word only, e.g. not inside Scandinavia)
  ##   change &   => n
  ##   change all spaces to => -
  ##   e.g.  East & Southeast Asia         => east-n-southeast-asia
  ##         Central America and Caribbean => central-america-n-caribbean
  def region_to_slug( text )
    text.downcase.gsub( /\band\b/, 'n' ).gsub( '&', 'n' ).gsub( ' ', '-' )
  end
end ## JsonPageReader
|
50
|
-
|
51
|
-
end # module Factbook
|
data/lib/factbook/sanitizer.rb
DELETED
@@ -1,207 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
## Cuts the country profile out of a (raw) factbook html page and
##  cleans it up (removes style attributes, audio players, country
##  comparisons, anchors, lists and unknown css classes).
class Sanitizer
  include LogUtils::Logging
  include Utils            ## pulls in encode_utf8, ...


  ## Returns an array with three entries:
  ##    1) html profile without headers, footers, scripts, etc.
  ##    2) page (meta) info e.g. country_name, country_code, last_updated, etc.
  ##    3) errors e.g. list of errors e.g. encoding errors (invalid byte sequence etc.)
  ##
  ##  note: find_page_info, find_page_last_updated and encode_utf8 are
  ##         NOT defined in this class (presumably pulled in via Utils)
  def sanitize( html_ascii )
    ## todo: add option for (html source) encoding - why?? why not??

    page_info = PageInfo.new

    h = find_page_info( html_ascii )
    page_info.country_code        = h[:country_code]
    page_info.country_name        = h[:country_name]
    page_info.country_affiliation = h[:country_affiliation]
    page_info.region_code         = h[:region_code]
    page_info.region_name         = h[:region_name]

    page_info.last_updated        = find_page_last_updated( html_ascii )


    html_profile_ascii = find_country_profile( html_ascii )    ## cut-off headers, footers, scripts, etc.

    ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
    html, errors = encode_utf8( html_profile_ascii )  ## change encoding to utf-8  (from binary/ascii8bit)

    html = sanitize_profile( html )

    [html, page_info, errors]
  end



  BEGIN_FACTS_REGEX = /<ul\s+
                         class="expandcollapse">
                      /xim        ## ignore case; multi-line

  END_FACTS_REGEX  = /<\/li>\s*
                       <\/ul>\s*
                       <\/tbody>\s*
                       <\/table>
                     /xim        ## ignore case; multi-line


  ## Returns the part of the page starting at the begin facts marker
  ##  and ending right before the (first) end facts marker found after it;
  ##  the list gets closed again with </li></ul>.
  ##
  ## Raises a RuntimeError if one of the two markers is missing.
  def find_country_profile( html )
    ####
    ## remove header (everything before)
    ##   <ul class="expandcollapse">

    begin_pos = html.index( BEGIN_FACTS_REGEX )
    fail "*** no begin facts marker found for page"  if begin_pos.nil?

    logger.debug "  bingo - found BEGIN_FACTS on pos #{begin_pos}"
    html = html[begin_pos..-1]

    logger.debug html[0..100].inspect

    ###
    ## remove footer
    ##  assume everything after (last list item in unordered list inside a table body)
    ##    </li>
    ##    </ul>
    ##    </tbody></table>

    end_pos = html.index( END_FACTS_REGEX )
    fail "*** no end facts marker found for page"  if end_pos.nil?

    logger.debug "  bingo - found END_FACTS on pos #{end_pos}"
    html = html[0...end_pos] + "</li></ul>\n"   ## note: use ... (not .. to cut-off pos)

    logger.debug html[-200..-1].inspect   ## note: nil if the profile is shorter than 200 chars
    html
  end



  STYLE_ATTR_REGEX = /\s*
                       style=('|").+?\1     ## note: use non-greedy match e.g. .+?
                     /xim    ## do NOT allow multi-line - why? why not?

  CLASS_ATTR_REGEX = /\s*
                       class=('|")(.+?)\1   ## note: use non-greedy match e.g. .+?
                     /xim    ## do NOT allow multi-line - why? why not?

  ##
  ## <div>
  ##   <span class='category'>country comparison to the world: </span>
  ##   <span class='category_data'>[[191]]</span>
  ## </div>
  ##
  ##  <span class='category'>country comparison to the world: </span>
  ##  <span class='category_data'><a href='../rankorder/2147rank.html#au'>114</a></span>


  ## todo: add enclosing div too!!!

  COUNTRY_COMPARISON_REGEX = /
     <div>
       <span \s class='category'[^>]*>
         country \s comparison \s to \s the \s world: \s*
       <\/span>
        \s*
       <span \s class='category_data'[^>]*>
        \s*
          <a \s [^>]+>
           .+?
          <\/a>
        \s*
       <\/span>
     <\/div>
    /xim

  ##
  ## <div class='wrap'>
  ##  <div class='audio-player'>
  ##    <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
  ##    </audio>
  ##  </div></div>


  AUDIO_PLAYER_REGEX = /
      <div \s class='wrap'>
        <div \s class='audio-player'>
          <audio \s [^>]+>
          <\/audio>
        <\/div>
      <\/div>
    /xim

  ## css classes that survive the class attribute clean-up
  KNOWN_CLASSES = ['region', 'category', 'category_data']

  ## Returns the cleaned-up profile:
  ##   style attributes, audio players and country comparisons get removed;
  ##   anchors get replaced by their inner text (or removed if wrapping an image);
  ##   list tags (ul/li) get removed;
  ##   class attributes get reduced to the known classes.
  ##
  ##  note: the order of the steps matters e.g. country comparisons
  ##         get matched BEFORE anchors get stripped
  def sanitize_profile( html )

    html = html.gsub( STYLE_ATTR_REGEX ) do |m|
      logger.debug "remove style attr: #{m}"
      ''
    end

    html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
      logger.debug "remove audio player: #{m}"
      ''
    end

    html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
      logger.debug "remove country comparison: #{m}"
      ''
    end

    ## remove/cleanup anchors (a href)
    html = html.gsub( /<a\s+href[^>]*>(.+?)<\/a>/im ) do |_|   ## note: use .+? non-greedy match
      inner_text = Regexp.last_match( 1 )
      logger.debug " replace anchor (a) href >#{inner_text}<"

      if inner_text.include?( '<img' )   ## if includes image remove
        logger.debug "  remove image in anchor"
        ''
      else   ## keep inner text
        inner_text
      end
    end

    ## remove all list e.g. ul/li
    html = html.gsub( /<\/?(li|ul)[^>]*>/im ) do |m|
      logger.debug " remove list >#{m}<"
      ''
    end

    ## clean-up class attrib e.g. remove unknown classes
    html = html.gsub( CLASS_ATTR_REGEX ) do |m|
      logger.debug "cleanup class attr: #{m}"

      klasses, unknown = Regexp.last_match( 2 ).split( ' ' ).partition do |klass|
        KNOWN_CLASSES.include?( klass )
      end
      unknown.each { |klass| logger.debug "  remove class #{klass}" }

      if klasses.empty?
        ''     ## remove class attrib completely
      else
        " class='#{klasses.join(' ')}'"    ## note: add leading space!!
      end
    end

    html
  end


end # class Sanitizer
|
206
|
-
|
207
|
-
end # module Factbook
|