factbook 1.1.1 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -58
- data/README.md +50 -575
- data/Rakefile +29 -33
- data/lib/factbook.rb +8 -75
- metadata +20 -114
- data/data/attributes.yml +0 -337
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook/almanac.rb +0 -72
- data/lib/factbook/attributes.rb +0 -74
- data/lib/factbook/builder.rb +0 -214
- data/lib/factbook/builder_item.rb +0 -92
- data/lib/factbook/builder_json.rb +0 -79
- data/lib/factbook/codes.rb +0 -119
- data/lib/factbook/comparisons.rb +0 -50
- data/lib/factbook/counter.rb +0 -48
- data/lib/factbook/db/importer.rb +0 -92
- data/lib/factbook/db/models.rb +0 -11
- data/lib/factbook/db/schema.rb +0 -36
- data/lib/factbook/normalize.rb +0 -43
- data/lib/factbook/page.rb +0 -185
- data/lib/factbook/page_info.rb +0 -12
- data/lib/factbook/reader_json.rb +0 -51
- data/lib/factbook/sanitizer.rb +0 -207
- data/lib/factbook/sect.rb +0 -29
- data/lib/factbook/subsect.rb +0 -18
- data/lib/factbook/table.rb +0 -52
- data/lib/factbook/utils.rb +0 -85
- data/lib/factbook/utils_info.rb +0 -102
- data/lib/factbook/version.rb +0 -22
- data/script/almanac.rb +0 -48
- data/script/attributes.rb +0 -34
- data/script/build.rb +0 -28
- data/script/counter.rb +0 -145
- data/script/json.rb +0 -18
- data/script/testbr.rb +0 -33
- data/script/testcodes.rb +0 -11
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/au.html +0 -2006
- data/test/data/src/be.html +0 -2011
- data/test/helper.rb +0 -11
- data/test/test_attribs.rb +0 -82
- data/test/test_attribs_def.rb +0 -20
- data/test/test_builder.rb +0 -35
- data/test/test_codes.rb +0 -76
- data/test/test_comparisons.rb +0 -19
- data/test/test_convert.rb +0 -30
- data/test/test_counter.rb +0 -31
- data/test/test_fields.rb +0 -52
- data/test/test_importer.rb +0 -55
- data/test/test_item_builder.rb +0 -99
- data/test/test_json.rb +0 -44
- data/test/test_json_builder.rb +0 -25
- data/test/test_normalize.rb +0 -23
- data/test/test_page.rb +0 -38
- data/test/test_sanitizer.rb +0 -35
data/lib/factbook/page.rb
DELETED
@@ -1,185 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
## note:
|
7
|
-
## some factbook pages with chrome (headers, footers, etc.)
|
8
|
-
## are NOT valid utf-8, thus,
|
9
|
-
## treat page as is (e.g. ASCII8BIT)
|
10
|
-
#
|
11
|
-
# only convert to utf8 when header and footer got stripped
|
12
|
-
|
13
|
-
##
|
14
|
-
## be/benin:
|
15
|
-
## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
|
16
|
-
#
|
17
|
-
## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
|
18
|
-
# Lazare Sèhouéto
|
19
|
-
#
|
20
|
-
# looks good - use (assume) Windows-1252 ????
|
21
|
-
|
22
|
-
##
|
23
|
-
# check if the page is ascii 7-bit ??? if yes - no worries
|
24
|
-
# if not, log number of chars not using ascii 7-bit
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
class Page
|
29
|
-
include LogUtils::Logging
|
30
|
-
|
31
|
-
attr_reader :sects ## "structured" access e.g. sects/subsects/etc.
|
32
|
-
attr_reader :info ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
|
33
|
-
attr_reader :data ## "plain" access with vanilla hash
|
34
|
-
|
35
|
-
|
36
|
-
## standard version (note: requires https)
|
37
|
-
SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
|
38
|
-
|
39
|
-
def initialize( code, opts={} )
|
40
|
-
### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
|
41
|
-
|
42
|
-
if opts[:json]
|
43
|
-
json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
|
44
|
-
b = JsonBuilder.from_string( json )
|
45
|
-
else ## assume html
|
46
|
-
if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
|
47
|
-
## for debugging and testing allow "custom" passed-in html page
|
48
|
-
html = opts[:html]
|
49
|
-
else
|
50
|
-
url_string = SITE_BASE.gsub( '{code}', code )
|
51
|
-
## note: expects ASCII-7BIT/BINARY encoding
|
52
|
-
html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
|
53
|
-
end
|
54
|
-
b = Builder.from_string( html )
|
55
|
-
end
|
56
|
-
|
57
|
-
@sects = b.sects
|
58
|
-
@info = b.info
|
59
|
-
|
60
|
-
## todo/fix/quick hack:
|
61
|
-
## check for info opts hash entry - lets you overwrite page info
|
62
|
-
## -- use proper header to setup page info - why, why not??
|
63
|
-
if opts[:info]
|
64
|
-
info = opts[:info]
|
65
|
-
@info = info
|
66
|
-
end
|
67
|
-
|
68
|
-
@data = {}
|
69
|
-
@sects.each do |sect|
|
70
|
-
@data[ sect.title ] = sect.data
|
71
|
-
end
|
72
|
-
|
73
|
-
self ## return self (check - not needed??)
|
74
|
-
end
|
75
|
-
|
76
|
-
|
77
|
-
def to_json( opts={} ) ## convenience helper for data.to_json; note: pretty print by default!
|
78
|
-
if opts[:minify]
|
79
|
-
data.to_json
|
80
|
-
else
|
81
|
-
## was: -- opts[:pretty] || opts[:pp]
|
82
|
-
JSON.pretty_generate( data ) ## note: pretty print by default!
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
|
87
|
-
def [](key) ### convenience shortcut
|
88
|
-
# lets you use
|
89
|
-
# page['geo']
|
90
|
-
# instead of
|
91
|
-
# page.data['geo']
|
92
|
-
|
93
|
-
## fix: use delegate data, [] from forwardable lib - why?? why not??
|
94
|
-
|
95
|
-
data[key]
|
96
|
-
end
|
97
|
-
|
98
|
-
## add convenience (shortcut) accessors / attributes / fields / getters
|
99
|
-
|
100
|
-
ATTRIBUTES.each do |attrib|
|
101
|
-
## e.g.
|
102
|
-
## def background() data['Introduction']['Background']['text']; end
|
103
|
-
## def location() data['Geography']['Location']['text']; end
|
104
|
-
## etc.
|
105
|
-
if attrib.path.size == 1
|
106
|
-
define_method attrib.name.to_sym do
|
107
|
-
@data.fetch( attrib.category, {} ).
|
108
|
-
fetch( attrib.path[0], {} )['text']
|
109
|
-
end
|
110
|
-
else ## assume size 2 for now
|
111
|
-
define_method attrib.name.to_sym do
|
112
|
-
@data.fetch( attrib.category, {} ).
|
113
|
-
fetch( attrib.path[0], {} ).
|
114
|
-
fetch( attrib.path[1], {} )['text']
|
115
|
-
end
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
|
120
|
-
private
|
121
|
-
def fetch_page( url_string )
|
122
|
-
|
123
|
-
worker = Fetcher::Worker.new
|
124
|
-
response = worker.get_response( url_string )
|
125
|
-
|
126
|
-
if response.code == '200'
|
127
|
-
t = response.body
|
128
|
-
###
|
129
|
-
# NB: Net::HTTP will NOT set encoding UTF-8 etc.
|
130
|
-
# will mostly be ASCII
|
131
|
-
# - try to change encoding to UTF-8 ourselves
|
132
|
-
logger.debug "t.encoding.name (before): #{t.encoding.name}"
|
133
|
-
#####
|
134
|
-
# NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
135
|
-
t
|
136
|
-
else
|
137
|
-
logger.error "fetch HTTP - #{response.code} #{response.message}"
|
138
|
-
## todo/fix: raise http exception (see fetcher) -- why? why not??
|
139
|
-
fail "fetch HTTP - #{response.code} #{response.message}"
|
140
|
-
nil
|
141
|
-
end
|
142
|
-
end
|
143
|
-
|
144
|
-
|
145
|
-
=begin
|
146
|
-
def self.from_url( cc, cn )
|
147
|
-
html_ascii = PageFetcher.new.fetch( cc )
|
148
|
-
self.new( cc, cn, html_ascii )
|
149
|
-
end
|
150
|
-
|
151
|
-
def self.from_file( cc, cn, opts={} )
|
152
|
-
input_dir = opts[:input_dir] || '.'
|
153
|
-
html_ascii = File.read( "#{input_dir}/#{cc}.html" ) ## fix/todo: use ASCII8BIT/binary reader
|
154
|
-
self.new( cc, cn, html_ascii )
|
155
|
-
end
|
156
|
-
=end
|
157
|
-
|
158
|
-
|
159
|
-
end # class Page
|
160
|
-
|
161
|
-
|
162
|
-
=begin
|
163
|
-
class PageFetcher
|
164
|
-
|
165
|
-
def fetch( cc )
|
166
|
-
worker = Fetcher::Worker.new
|
167
|
-
factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
|
168
|
-
|
169
|
-
res = worker.get_response( "#{factbook_base}/#{cc}.html" )
|
170
|
-
|
171
|
-
# on error throw exception - why? why not??
|
172
|
-
if res.code != '200'
|
173
|
-
raise Fetcher::HttpError.new( res.code, res.message )
|
174
|
-
end
|
175
|
-
|
176
|
-
###
|
177
|
-
# Note: Net::HTTP will NOT set encoding UTF-8 etc.
|
178
|
-
# will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
179
|
-
html = res.body.to_s
|
180
|
-
end
|
181
|
-
end # PageFetcher
|
182
|
-
=end
|
183
|
-
|
184
|
-
|
185
|
-
end # module Factbook
|
data/lib/factbook/page_info.rb
DELETED
data/lib/factbook/reader_json.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
class JsonPageReader
|
7
|
-
def initialize( json_dir )
|
8
|
-
@json_dir = json_dir
|
9
|
-
end
|
10
|
-
|
11
|
-
def read_page( code )
|
12
|
-
path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"
|
13
|
-
|
14
|
-
puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
|
15
|
-
json = File.read( path )
|
16
|
-
|
17
|
-
## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
|
18
|
-
# add some page info from code struct
|
19
|
-
|
20
|
-
info = PageInfo.new
|
21
|
-
info.country_code = code.code
|
22
|
-
info.country_name = code.name
|
23
|
-
info.region_name = code.region
|
24
|
-
|
25
|
-
page = Page.new( code.code, json: json, info: info )
|
26
|
-
page
|
27
|
-
end
|
28
|
-
|
29
|
-
def read_pages( codes, limit: nil )
|
30
|
-
pages = []
|
31
|
-
i=0
|
32
|
-
codes.each do |code|
|
33
|
-
next if limit && i > limit ## for debugging just process first x entries
|
34
|
-
|
35
|
-
pages << read_page( code )
|
36
|
-
end
|
37
|
-
pages
|
38
|
-
end
|
39
|
-
|
40
|
-
private
|
41
|
-
def region_to_slug( text )
|
42
|
-
## change and => n
|
43
|
-
## change & => n
|
44
|
-
## change all spaces to => -
|
45
|
-
## e.g. East & Southeast Asia => east-n-southeast-asia
|
46
|
-
## Central America and Caribbean => central-america-n-caribbean
|
47
|
-
text.downcase.gsub('and', 'n').gsub( '&', 'n' ).gsub( ' ', '-' )
|
48
|
-
end
|
49
|
-
end ## JsonPageReader
|
50
|
-
|
51
|
-
end # module Factbook
|
data/lib/factbook/sanitizer.rb
DELETED
@@ -1,207 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
class Sanitizer
|
6
|
-
include LogUtils::Logging
|
7
|
-
include Utils ## pulls in encode_utf8, ...
|
8
|
-
|
9
|
-
|
10
|
-
def sanitize( html_ascii )
|
11
|
-
## todo: add option for (html source) encoding - why?? why not??
|
12
|
-
|
13
|
-
## note:
|
14
|
-
## returns 1) html profile withouth headers, footers, scripts,etc.
|
15
|
-
## 2) page (meta) info e.g. country_name, country_code, last_updated, etc.
|
16
|
-
## 3) errors e.g. list of errors e.g. endcoding errors (invalid byte sequence etc.)
|
17
|
-
|
18
|
-
page_info = PageInfo.new
|
19
|
-
|
20
|
-
h = find_page_info( html_ascii )
|
21
|
-
page_info.country_code = h[:country_code]
|
22
|
-
page_info.country_name = h[:country_name]
|
23
|
-
page_info.country_affiliation = h[:country_affiliation]
|
24
|
-
page_info.region_code = h[:region_code]
|
25
|
-
page_info.region_name = h[:region_name]
|
26
|
-
|
27
|
-
page_info.last_updated = find_page_last_updated( html_ascii )
|
28
|
-
|
29
|
-
|
30
|
-
html_profile_ascii = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
|
31
|
-
|
32
|
-
## todo/fix: assume windows 12xx encoding!!!! for factbook - try
|
33
|
-
html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
|
34
|
-
|
35
|
-
html = sanitize_profile( html )
|
36
|
-
|
37
|
-
[html, page_info, errors]
|
38
|
-
end
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
BEGIN_FACTS_REGEX = /<ul\s+
|
43
|
-
class="expandcollapse">
|
44
|
-
/xim ## ignore case; multi-line
|
45
|
-
|
46
|
-
END_FACTS_REGEX = /<\/li>\s*
|
47
|
-
<\/ul>\s*
|
48
|
-
<\/tbody>\s*
|
49
|
-
<\/table>
|
50
|
-
/xim ## ignore case; multi-line
|
51
|
-
|
52
|
-
|
53
|
-
def find_country_profile( html )
|
54
|
-
####
|
55
|
-
## remove header (everything before)
|
56
|
-
## <ul class="expandcollapse">
|
57
|
-
|
58
|
-
pos = html.index( BEGIN_FACTS_REGEX )
|
59
|
-
fail "*** no begin facts marker found for page" if pos.nil?
|
60
|
-
|
61
|
-
puts " bingo - found BEGIN_FACTS on pos #{pos}"
|
62
|
-
html = html[pos..-1]
|
63
|
-
|
64
|
-
pp html[0..100]
|
65
|
-
|
66
|
-
###
|
67
|
-
## remove footer
|
68
|
-
## assume everthings after (last list item in unorder list inside a table body)
|
69
|
-
## </li>
|
70
|
-
## </ul>
|
71
|
-
## </tbody></table>
|
72
|
-
|
73
|
-
pos = html.index( END_FACTS_REGEX )
|
74
|
-
fail "*** no end facts marker found for page" if pos.nil?
|
75
|
-
|
76
|
-
puts " bingo - found END_FACTS on pos #{pos}"
|
77
|
-
html = html[0...pos] + "</li></ul>\n" ## note: use ... (not .. to cut-off pos)
|
78
|
-
|
79
|
-
pp html[-200..-1]
|
80
|
-
html
|
81
|
-
end
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
STYLE_ATTR_REGEX = /\s*
|
86
|
-
style=('|").+?\1 ## note: use non-greedy match e.g. .+?
|
87
|
-
/xim ## do NOT allow multi-line - why? why not?
|
88
|
-
|
89
|
-
CLASS_ATTR_REGEX = /\s*
|
90
|
-
class=('|")(.+?)\1 ## note: use non-greedy match e.g. .+?
|
91
|
-
/xim ## do NOT allow multi-line - why? why not?
|
92
|
-
|
93
|
-
##
|
94
|
-
## <div>
|
95
|
-
## <span class='category'>country comparison to the world: </span>
|
96
|
-
## <span class='category_data'>[[191]]</span>
|
97
|
-
## </div>
|
98
|
-
##
|
99
|
-
## <span class='category'>country comparison to the world: </span>
|
100
|
-
## <span class='category_data'><a href='../rankorder/2147rank.html#au'>114</a></span>
|
101
|
-
|
102
|
-
|
103
|
-
## todo: add enclosing div too!!!
|
104
|
-
|
105
|
-
COUNTRY_COMPARISON_REGEX = /
|
106
|
-
<div>
|
107
|
-
<span \s class='category'[^>]*>
|
108
|
-
country \s comparison \s to \s the \s world: \s*
|
109
|
-
<\/span>
|
110
|
-
\s*
|
111
|
-
<span \s class='category_data'[^>]*>
|
112
|
-
\s*
|
113
|
-
<a \s [^>]+>
|
114
|
-
.+?
|
115
|
-
<\/a>
|
116
|
-
\s*
|
117
|
-
<\/span>
|
118
|
-
<\/div>
|
119
|
-
/xim
|
120
|
-
|
121
|
-
##
|
122
|
-
## <div class='wrap'>
|
123
|
-
## <div class='audio-player'>
|
124
|
-
## <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
|
125
|
-
## </audio>
|
126
|
-
## </div></div>
|
127
|
-
|
128
|
-
|
129
|
-
AUDIO_PLAYER_REGEX = /
|
130
|
-
<div \s class='wrap'>
|
131
|
-
<div \s class='audio-player'>
|
132
|
-
<audio \s [^>]+>
|
133
|
-
<\/audio>
|
134
|
-
<\/div>
|
135
|
-
<\/div>
|
136
|
-
/xim
|
137
|
-
|
138
|
-
def sanitize_profile( html )
|
139
|
-
|
140
|
-
html = html.gsub( STYLE_ATTR_REGEX ) do |m|
|
141
|
-
puts "remove style attr:"
|
142
|
-
puts "#{m}"
|
143
|
-
''
|
144
|
-
end
|
145
|
-
|
146
|
-
html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
|
147
|
-
puts "remove audio player:"
|
148
|
-
puts "#{m}"
|
149
|
-
''
|
150
|
-
end
|
151
|
-
|
152
|
-
|
153
|
-
html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
|
154
|
-
puts "remove country comparison:"
|
155
|
-
puts "#{m}"
|
156
|
-
''
|
157
|
-
end
|
158
|
-
|
159
|
-
## remove/cleanup anchors (a href)
|
160
|
-
html = html.gsub( /<a\s+href[^>]*>(.+?)<\/a>/im ) do |_| ## note: use .+? non-greedy match
|
161
|
-
puts " replace anchor (a) href >#{$1}<"
|
162
|
-
|
163
|
-
inner_text = $1.dup ## keep a copy
|
164
|
-
if inner_text =~ /<img/ ## if includes image remove
|
165
|
-
puts " remove image in anchor"
|
166
|
-
''
|
167
|
-
else ## keep inner text
|
168
|
-
inner_text
|
169
|
-
end
|
170
|
-
end
|
171
|
-
|
172
|
-
|
173
|
-
## remove all list e.g. ul/li
|
174
|
-
html = html.gsub( /<\/?(li|ul)[^>]*>/im ) do |m|
|
175
|
-
puts " remove list >#{m}<"
|
176
|
-
''
|
177
|
-
end
|
178
|
-
|
179
|
-
## clean-up class attrib e.g. remove unknown classes
|
180
|
-
html = html.gsub( CLASS_ATTR_REGEX ) do |m|
|
181
|
-
puts "cleanup class attr:"
|
182
|
-
puts "#{m}"
|
183
|
-
|
184
|
-
klasses = $2.split(' ')
|
185
|
-
klasses = klasses.select do |klass|
|
186
|
-
if ['region', 'category', 'category_data'].include?( klass )
|
187
|
-
true
|
188
|
-
else
|
189
|
-
puts " remove class #{klass}"
|
190
|
-
false
|
191
|
-
end
|
192
|
-
end
|
193
|
-
|
194
|
-
if klasses.size > 0
|
195
|
-
" class='#{klasses.join(' ')}'" ## note: add leading space!!
|
196
|
-
else
|
197
|
-
'' ## remove class attrib completely
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
html
|
202
|
-
end
|
203
|
-
|
204
|
-
|
205
|
-
end # class Sanitizer
|
206
|
-
|
207
|
-
end # module Factbook
|