factbook 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/Manifest.txt +10 -0
- data/README.md +24 -2
- data/Rakefile +2 -1
- data/lib/factbook/page.rb +408 -0
- data/lib/factbook/version.rb +1 -1
- data/lib/factbook.rb +4 -0
- data/test/data/countrytemplate_au.html +4179 -0
- data/test/data/countrytemplate_be.html +4260 -0
- data/test/data/countrytemplate_br.html +4366 -0
- data/test/data/countrytemplate_mx.html +4397 -0
- data/test/helper.rb +15 -0
- data/test/test_json.rb +45 -0
- data/test/test_page.rb +227 -0
- data/test/test_page_old.rb +290 -0
- data/test/test_strip.rb +66 -0
- metadata +37 -11
data/.gemtest
ADDED
File without changes
|
data/Manifest.txt
CHANGED
@@ -3,4 +3,14 @@ Manifest.txt
|
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
5
|
lib/factbook.rb
|
6
|
+
lib/factbook/page.rb
|
6
7
|
lib/factbook/version.rb
|
8
|
+
test/data/countrytemplate_au.html
|
9
|
+
test/data/countrytemplate_be.html
|
10
|
+
test/data/countrytemplate_br.html
|
11
|
+
test/data/countrytemplate_mx.html
|
12
|
+
test/helper.rb
|
13
|
+
test/test_json.rb
|
14
|
+
test/test_page.rb
|
15
|
+
test/test_page_old.rb
|
16
|
+
test/test_strip.rb
|
data/README.md
CHANGED
@@ -7,9 +7,30 @@
|
|
7
7
|
* forum :: [groups.google.com/group/openmundi](https://groups.google.com/group/openmundi)
|
8
8
|
|
9
9
|
|
10
|
+
|
11
|
+
## What's the World Factbook?
|
12
|
+
|
13
|
+
The World Factbook published by the Central Intelligence Agency (CIA)
|
14
|
+
offers free country profiles in the public domain (that is, no copyright(s), no rights reserved).
|
15
|
+
|
16
|
+
- [1] [The World Factbook](https://www.cia.gov/library/publications/the-world-factbook/)
|
17
|
+
- [2] [Wikipedia Article: The World Factbook](http://en.wikipedia.org/wiki/The_World_Factbook)
|
18
|
+
|
19
|
+
|
10
20
|
## Usage
|
11
21
|
|
12
|
-
|
22
|
+
### Get page as a hash (that is, structured data e.g. nested key/values)
|
23
|
+
|
24
|
+
page = Factbook::Page.new( 'br' )
|
25
|
+
pp page.data # pretty print hash
|
26
|
+
|
27
|
+
### Save to disk as JSON
|
28
|
+
|
29
|
+
page = Factbook::Page.new( 'br' )
|
30
|
+
File.open( 'br.json', 'w') do |f|
|
31
|
+
f.write( JSON.pretty_generate( page.data ) )
|
32
|
+
end
|
33
|
+
|
13
34
|
|
14
35
|
## Install
|
15
36
|
|
@@ -18,9 +39,10 @@ Just install the gem:
|
|
18
39
|
$ gem install factbook
|
19
40
|
|
20
41
|
|
42
|
+
|
21
43
|
## Alternatives
|
22
44
|
|
23
|
-
|
45
|
+
- [worldfactbook gem](https://github.com/sayem/worldfactbook) by sayem (aka Sayem Khan); fetches data from its own mirror, that is, rubyworldfactbook.com (last updated 2011?)
|
24
46
|
|
25
47
|
|
26
48
|
## License
|
data/Rakefile
CHANGED
data/lib/factbook/page.rb
ADDED
@@ -0,0 +1,408 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
class Page
|
6
|
+
|
7
|
+
include LogUtils::Logging
|
8
|
+
|
9
|
+
## standard version
|
10
|
+
## SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
|
11
|
+
|
12
|
+
## -- use text (low-bandwidth) version
|
13
|
+
## e.g. www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_br.html
|
14
|
+
SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
|
15
|
+
|
16
|
+
## Set up a page for a single country. Only the code is remembered here;
## nothing is fetched or parsed until html, doc, sects or data is called.
##
## code - value substituted for the {code} placeholder in SITE_BASE
##        (the README uses 'br')
def initialize(code)
  @code = code
end
|
19
|
+
|
20
|
+
## The whole page (as returned by html) parsed with Nokogiri.
## Parsed on first use; later calls hand back the same document.
def doc
  return @doc if @doc

  @doc = Nokogiri::HTML( html )
end
|
23
|
+
|
24
|
+
## Structured page data: a hash keyed by section title ('intro', 'geo', ...)
## whose values are the nested hashes sect_to_hash builds for each section.
##
## Titles are matched to sections by position, so this relies on sects
## returning the sections in the same order as the titles listed below.
##
## The result is collected in a local hash and only memoized once every
## section has been converted. If sects or sect_to_hash raises, nothing is
## cached, so the next call starts over instead of silently returning an
## empty or half-filled hash.
def data
  return @data unless @data.nil?

  titles = %w[intro geo people govt econ energy comm trans military issues]

  result = {}
  sects.each_with_index do |sect, i|
    logger.debug "############################"
    logger.debug "### stats sect #{i}:"

    result[ titles[i] ] = sect_to_hash( sect )
  end

  @data = result
end
|
49
|
+
|
50
|
+
|
51
|
+
## Split the page html into its sections and parse each one separately
## (parsing section by section avoids errors w/ nested tags).
##
## Returns an array of Nokogiri documents, one per entry in divs and in the
## same order (data relies on that order to label the sections).
##
## Each section runs from its own marker up to the character before the next
## marker that was found; the last one runs to the end of the html.
## A marker that cannot be found is logged as an error and yields an empty
## document in its slot, so the remaining sections keep their position
## instead of the whole split failing on a nil offset.
##
## @pos keeps the raw offsets (nil for a missing marker) plus a trailing -1
## standing for "until the end of the document".
def sects
  return @sects unless @sects.nil?

  divs = [
    '<div id="CollapsiblePanel1_Intro"',
    '<div id="CollapsiblePanel1_Geo"',
    '<div id="CollapsiblePanel1_People"',
    '<div id="CollapsiblePanel1_Govt"',
    '<div id="CollapsiblePanel1_Econ"',
    '<div id="CollapsiblePanel1_Energy"',
    '<div id="CollapsiblePanel1_Comm"',
    '<div id="CollapsiblePanel1_Trans"',
    '<div id="CollapsiblePanel1_Military"',
    '<div id="CollapsiblePanel1_Issues"' ]

  source = html.to_s   ## to_s: no html at all is handled like "no section found"

  @pos = divs.each_with_index.map do |div, i|
    found = source.index( div )
    if found.nil?
      logger.error "*** error: section not found -- #{div}"
    else
      logger.debug " found section #{i} @ #{found}"
    end
    found
  end
  @pos << -1   ## note: last entry add -1 for until the end of document

  ## build into a local array; memoize only once every section is cut
  result = divs.each_index.map do |i|
    from = @pos[i]
    if from.nil?
      logger.debug " skip section #{i} (not found)"
      next Nokogiri::HTML( '' )
    end

    ## next known offset; the trailing -1 guarantees there always is one
    to = @pos[ (i+1)..-1 ].compact.first
    to -= 1 unless to == -1   ## note: sub one (-1) unless end-of-string (-1)

    ## todo: check that from is smaller than to
    logger.debug " cut section #{i} [#{from}..#{to}]"
    Nokogiri::HTML( source[ from..to ] )
  end

  @sects = result
end
|
102
|
+
|
103
|
+
## Set the page's html directly (for debugging n testing), so nothing has to
## be fetched via the net.
##
## Note: html set this way is used as-is; the html reader only runs its
## cleanup on markup it fetched itself.
##
## Anything derived from the previous html is dropped, so doc, sects and data
## are rebuilt from the new markup instead of returning stale results.
def html=(html)
  @html = html

  @doc   = nil
  @sects = nil
  @pos   = nil
  @data  = nil
end
|
108
|
+
|
109
|
+
## The page's html as a string, fetched and cleaned up on first use and
## memoized afterwards. Html assigned through html= is returned untouched.
##
## Returns nil (and caches nothing, so a later call tries again) when fetch
## could not get the page; fetch itself logs the HTTP error. Without this
## guard the first cleanup step would fail on nil.
##
## Cleanup steps, in this order:
##   1. remove <script> and <style> blocks and <link> tags
##   2. cut everything before <div id="countryInfo">
##   3. remove "country comparison to the world" spans
##   4. remove inline style="..." attributes
##   5. remove table rows holding a single empty cell
##   6. remove the "World Leaders website" promo span
##
## Every removed fragment is reported via logger.debug, like the rest of the
## class, instead of being printed to stdout.
def html
  return @html unless @html.nil?

  text = fetch()
  return nil if text.nil?

  ## replace all matches of pattern w/ '' and log what got removed
  remove = lambda do |pattern, label|
    text = text.gsub( pattern ) do |m|
      logger.debug "remove #{label}:"
      logger.debug m
      ''
    end
  end

  ## remove inline script
  remove.call( /<script[^>]*>.*?<\/script>/m, 'script' )

  ## remove inline style
  remove.call( /<style[^>]*>.*?<\/style>/m, 'style' )

  ## remove link
  remove.call( /<link[^>]+>/, 'link' )

  ## remove everything before <div id="countryInfo" >
  ## NOTE(review): the pattern only matches the div when no other attribute
  ##  follows the id; style attributes are stripped further down, so a
  ##  <div id="countryInfo" style="..."> would not be found here - confirm
  ##  against the fetched pages
  div_country_info_regex = /<div id="countryInfo"\s*>/
  pos = text.index( div_country_info_regex )
  text = text[pos..-1] if pos

  ## remove country comparison
  ##   e.g. <span class="category" >country comparison to the world:</span>
  ##        <span class="category_data">
  ##          <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world">
  ##            5
  ##          </a>
  ##        </span>
  ##
  ##  or on a single line
  ##
  ##   <span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data">
  ##   <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>
  country_comparison_regex = /
    <span \s class="category"[^>]*>
    country \s comparison \s to \s the \s world:
    <\/span>
    \s*
    <span \s class="category_data"[^>]*>
    \s*
    <a \s [^>]+>
    .+?
    <\/a>
    \s*
    <\/span>
  /xm
  remove.call( country_comparison_regex, 'country comparison' )

  ## remove style attributes e.g. style="padding-left:7px;"
  remove.call( /\s*style="[^"]+"/, 'style attr' )

  ## remove rows w/ a single empty cell
  ##   <tr height="22">
  ##     <td class="category_data"></td>
  ##   </tr>
  tr_empty_regex = /
    <tr[^>]*>
    \s*
    <td[^>]*> \s* <\/td>
    \s*
    <\/tr>
  /xm
  remove.call( tr_empty_regex, 'tr empty' )

  ## remove world leader website promo
  ##   <span class="category">(For more information visit the
  ##     <a href="/library/publications/world-leaders-1/index.html" target="_blank">World Leaders website</a>
  ##     <img src="../graphics/soa_newwindow.gif" alt="Opens in New Window" title="Opens in New Window" border="0"/>)
  ##   </span>
  world_leaders_website_regex = /
    <span \s class="category"[^>]*>
    \(
    For \s more \s information \s
    .+? ## non-greedy (smallest possible match
    \)
    <\/span>
  /xm
  remove.call( world_leaders_website_regex, 'world leader website promo' )

  @html = text
end
|
227
|
+
|
228
|
+
private
|
229
|
+
## Download the page for @code (SITE_BASE w/ {code} filled in).
##
## Returns the response body tagged as UTF-8, or nil (after logging the
## status code and message) for any response code other than '200'.
def fetch
  url      = SITE_BASE.gsub( '{code}', @code )
  response = Fetcher::Worker.new.get_response( url )

  unless response.code == '200'
    logger.error "fetch HTTP - #{response.code} #{response.message}"
    return nil
  end

  body = response.body

  ## NB: Net::HTTP will NOT set encoding UTF-8 etc.
  ##   will mostly be ASCII-8BIT (== BINARY == encoding unknown; raw bytes)
  logger.debug "t.encoding.name (before): #{body.encoding.name}"

  ## for now "hardcoded" to utf8 - what else can we do?
  ##  note: force_encoding only relabels the string; the bytes themselves
  ##        are not converted
  body = body.force_encoding( Encoding::UTF_8 )
  logger.debug "t.encoding.name (after): #{body.encoding.name}"

  body
end
|
257
|
+
|
258
|
+
|
259
|
+
## Turn a category label from the page into a hash key.
##
##   - lower case
##   - plural marker folded in:   seaport(s) => seaports
##   - colons dropped (trailing :)
##   - special chars ()-/,' replaced by a space
##   - leading n trailing spaces removed; every run of spaces becomes one _
##
## e.g.  'Area - comparative:' => 'area_comparative'
##
## Note: the apostrophe has to sit inside the character class; placed after
## the closing bracket the pattern only matches a special char that is
## directly followed by an apostrophe, so hardly anything gets replaced.
##
## key - label text (string)
##
## Returns a new string; key itself is not modified.
def cleanup_key( key )
  key.downcase
     .gsub( '(s)', 's' )
     .gsub( ':', '' )
     .gsub( /[()\-\/,']/, ' ' )
     .strip
     .gsub( /[ ]+/, '_' )
end
|
271
|
+
|
272
|
+
|
273
|
+
## Convert one section (a parsed document as built by sects) into a hash
##
##   { category => { subcategory => text, ... }, ... }
##
## The table cells of the section are walked in document order:
##   - a cell containing exactly one element w/ id field names the current
##     category (taken from its single div.category, run through cleanup_key)
##   - a cell w/ id data holds the values for the current category
##   - any other cell is logged as a warning and skipped
##
## Note: a data cell showing up before any field cell is stored under the
##  key nil (last_cat is still nil at that point).
##
## sect - Nokogiri document for one section
##
## Returns the hash.
def sect_to_hash( sect )
  rows      = sect.css( 'table tr' )
  cells     = sect.css( 'table tr td' )
  field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
  data_ids  = rows.css( '#data' )

  logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

  hash     = {}
  last_cat = nil

  cells.each_with_index do |cell,i|
    if cell.css( '#field' ).size == 1        # (nested) div#field in td

      cats = cell.css( 'div.category' )  ## note: ignore all .category not using div (issue warn/err if found!!) etc.
      if cats.size == 1
        last_cat = cleanup_key( cats.first.text.strip )
        logger.debug " [#{i}] category: >>#{last_cat}<<"
      else
        logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
        logger.warn cell.to_s
      end

    elsif cell['id'] == 'data'               # td#data

      ## counts are for the debug line only
      cats           = cell.css( 'div.category' )
      cats_data      = cell.css( 'div.category_data,span.category_data' )  ## note: ignore a.category_data etc.
      cats_div_data  = cell.css( 'div.category_data' )
      cats_span_data = cell.css( 'span.category_data' )

      logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"

      ## pairs to hash (if a subcategory shows up twice the later pair wins)
      pairs = data_cell_pairs( cell, last_cat )
      hash[ last_cat ] = pairs.each_with_object( {} ) { |(key, value), h| h[ key ] = value }

    else
      logger.warn "#### !!!! unknown cell type (no field or data id found):"
      logger.warn cell.to_s
    end
  end # each cell

  hash # return hash
end # method sect_to_hash

## Collect the [subcategory, text] pairs of a single data cell.
##
## Only the cell's direct div children are looked at:
##   - div.category starts a new pair (label n inline value, see
##     category_label_and_value)
##   - div.category_data adds its text to the current pair; if there is no
##     pair yet an implied one w/ the key 'text' gets created first
##   - anything else is logged as a warning and skipped
##
## The first category_data of a pair fills an empty value or is appended
## after a space; every further one is appended after "; " - except for the
## demographic_profile category where a plain space is used.
def data_cell_pairs( cell, last_cat )
  pairs = []
  last_pair = nil
  last_pair_data_count = 0

  ## loop over div blocks (might be .category or .category_data)
  cell.children.each do |child|
    next unless child.element?

    unless child.name == 'div'
      logger.warn " **** !!! skipping non-div >#{child.name}<:"
      logger.warn child.to_s
      next
    end

    case child['class']
    when 'category'
      text, value = category_label_and_value( child )
      logger.debug " -- category >>#{text}<<"

      ## start new pair
      last_pair = [ text, value ]
      last_pair_data_count = 0
      pairs << last_pair

    when 'category_data'
      logger.debug " -- category_data"
      text = child.text.strip

      if last_pair.nil?
        ## assume its the very first entry; use implied/auto-created category
        last_pair = [ 'text', '' ]
        last_pair_data_count = 0
        pairs << last_pair
      end

      if last_pair_data_count == 0        ## first category_data element?
        if last_pair[1] == ''
          last_pair[1] = text
        else
          last_pair[1] += " #{text}"       ## append after the inline value
        end
      elsif last_cat == 'demographic_profile'  ## special case (use space as sep)
        last_pair[1] += " #{text}"
      else
        last_pair[1] += "; #{text}"        ## append with separator
      end
      last_pair_data_count += 1

    else
      logger.warn " **** !!! skipping div w/o category or category_data class:"
      logger.warn child.to_s
    end
  end

  pairs
end

## Split a div.category into its key n its inline value.
##
## The key is the stripped text of all direct children except elements w/
## class category_data (joined w/o separator, then run through cleanup_key);
## the value is the stripped text of the span.category_data elements found
## inside the div.
##
## Returns [key, value].
def category_label_and_value( node )
  label = ''
  node.children.each do |sub|
    next if sub.element? && sub['class'] == 'category_data'
    label << sub.text.strip
  end

  [ cleanup_key( label ), node.css( 'span.category_data' ).text.strip ]
end
|
405
|
+
|
406
|
+
end # class Page
|
407
|
+
|
408
|
+
end # module Factbook
|
data/lib/factbook/version.rb
CHANGED
data/lib/factbook.rb
CHANGED
@@ -6,6 +6,7 @@ require 'net/http'
|
|
6
6
|
require 'uri'
|
7
7
|
require 'cgi'
|
8
8
|
require 'pp'
|
9
|
+
require 'json'
|
9
10
|
|
10
11
|
|
11
12
|
## 3rd party gems/libs
|
@@ -13,10 +14,13 @@ require 'pp'
|
|
13
14
|
|
14
15
|
require 'logutils'
|
15
16
|
require 'fetcher'
|
17
|
+
require 'nokogiri'
|
18
|
+
|
16
19
|
|
17
20
|
# our own code
|
18
21
|
|
19
22
|
require 'factbook/version' # let it always go first
|
23
|
+
require 'factbook/page'
|
20
24
|
|
21
25
|
|
22
26
|
module Factbook
|