factbook 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/Manifest.txt +10 -0
- data/README.md +24 -2
- data/Rakefile +2 -1
- data/lib/factbook/page.rb +408 -0
- data/lib/factbook/version.rb +1 -1
- data/lib/factbook.rb +4 -0
- data/test/data/countrytemplate_au.html +4179 -0
- data/test/data/countrytemplate_be.html +4260 -0
- data/test/data/countrytemplate_br.html +4366 -0
- data/test/data/countrytemplate_mx.html +4397 -0
- data/test/helper.rb +15 -0
- data/test/test_json.rb +45 -0
- data/test/test_page.rb +227 -0
- data/test/test_page_old.rb +290 -0
- data/test/test_strip.rb +66 -0
- metadata +37 -11
data/.gemtest
ADDED
File without changes
|
data/Manifest.txt
CHANGED
@@ -3,4 +3,14 @@ Manifest.txt
|
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
5
|
lib/factbook.rb
|
6
|
+
lib/factbook/page.rb
|
6
7
|
lib/factbook/version.rb
|
8
|
+
test/data/countrytemplate_au.html
|
9
|
+
test/data/countrytemplate_be.html
|
10
|
+
test/data/countrytemplate_br.html
|
11
|
+
test/data/countrytemplate_mx.html
|
12
|
+
test/helper.rb
|
13
|
+
test/test_json.rb
|
14
|
+
test/test_page.rb
|
15
|
+
test/test_page_old.rb
|
16
|
+
test/test_strip.rb
|
data/README.md
CHANGED
@@ -7,9 +7,30 @@
|
|
7
7
|
* forum :: [groups.google.com/group/openmundi](https://groups.google.com/group/openmundi)
|
8
8
|
|
9
9
|
|
10
|
+
|
11
|
+
## What's the World Factbook?
|
12
|
+
|
13
|
+
The World Factbook published by the Central Intelligence Agency (CIA)
|
14
|
+
offers free country profiles in the public domain (that is, no copyright(s), no rights reserved).
|
15
|
+
|
16
|
+
- [1] [The World Factbook](https://www.cia.gov/library/publications/the-world-factbook/)
|
17
|
+
- [2] [Wikipedia Article: The World Factbook](http://en.wikipedia.org/wiki/The_World_Factbook)
|
18
|
+
|
19
|
+
|
10
20
|
## Usage
|
11
21
|
|
12
|
-
|
22
|
+
### Get page as a hash (that is, structured data e.g. nested key/values)
|
23
|
+
|
24
|
+
page = Factbook::Page.new( 'br' )
|
25
|
+
pp page.data # pretty print hash
|
26
|
+
|
27
|
+
### Save to disk as JSON
|
28
|
+
|
29
|
+
page = Factbook::Page.new( 'br' )
|
30
|
+
File.open( 'br.json', 'w') do |f|
|
31
|
+
f.write( JSON.pretty_generate( page.data ) )
|
32
|
+
end
|
33
|
+
|
13
34
|
|
14
35
|
## Install
|
15
36
|
|
@@ -18,9 +39,10 @@ Just install the gem:
|
|
18
39
|
$ gem install factbook
|
19
40
|
|
20
41
|
|
42
|
+
|
21
43
|
## Alternatives
|
22
44
|
|
23
|
-
|
45
|
+
- [worldfactbook gem](https://github.com/sayem/worldfactbook) by sayem (aka Sayem Khan); fetches data from its own mirror, that is, rubyworldfactbook.com (last updated 2011?)
|
24
46
|
|
25
47
|
|
26
48
|
## License
|
data/Rakefile
CHANGED
data/lib/factbook/page.rb
ADDED
@@ -0,0 +1,408 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
class Page
|
6
|
+
|
7
|
+
include LogUtils::Logging
|
8
|
+
|
9
|
+
## standard version
|
10
|
+
## SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
|
11
|
+
|
12
|
+
## -- use text (low-bandwidth) version
|
13
|
+
## e.g. www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_br.html
|
14
|
+
SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
|
15
|
+
|
16
|
+
## Creates a page object for a single country.
##
## code - country code that gets substituted for {code} in SITE_BASE
##        when the page is fetched (e.g. 'br')
def initialize(code)
  @code = code
end
|
19
|
+
|
20
|
+
## Returns the page's html parsed with Nokogiri::HTML.
## The parsed document is built on first use and reused afterwards.
def doc
  return @doc if @doc

  @doc = Nokogiri::HTML(html)
end
|
23
|
+
|
24
|
+
## Returns the page as a nested hash: one entry per section (in the order
## delivered by sects), each holding the hash built by sect_to_hash.
##
## The result is memoized. It is only stored once every section has been
## converted, so an exception while parsing does not leave a partially
## filled hash cached for later calls.
def data
  return @data unless @data.nil?

  ## keys for the sections; same order as the section markers used in sects
  titles = %w[intro geo people govt econ energy comm trans military issues]

  parsed = {}
  sects.each_with_index do |sect, i|
    logger.debug "############################"
    logger.debug "### stats sect #{i}:"

    parsed[titles[i]] = sect_to_hash(sect)
  end

  @data = parsed
end
|
49
|
+
|
50
|
+
|
51
|
+
## Splits the html into one chunk per section and parses every chunk on its
## own with Nokogiri::HTML (to avoid errors w/ nested tags).
##
## Returns an array with exactly one entry per section marker, in marker
## order, so that callers can map entries to section names by index.
## A section whose marker is not found in the html is logged as an error
## and represented by an empty document; the preceding section then runs
## up to the next marker that was found (or to the end of the document).
##
## The result is memoized once all sections have been cut.
def sects
  return @sects unless @sects.nil?

  markers = %w[Intro Geo People Govt Econ Energy Comm Trans Military Issues].map do |name|
    %Q(<div id="CollapsiblePanel1_#{name}")
  end

  page = html

  ## start offset of every section (nil if the marker is missing)
  @pos = markers.each_with_index.map do |marker, i|
    offset = page.index( marker )
    if offset.nil?
      logger.error "*** error: section not found -- #{marker}"
    else
      logger.debug " found section #{i} @ #{offset}"
    end
    offset
  end
  @pos << -1   ## note: last entry add -1 for until the end of document

  @sects = markers.each_index.map do |i|
    from = @pos[i]
    if from.nil?
      ## keep the slot so that indexes stay aligned with the markers
      Nokogiri::HTML( '' )
    else
      ## next known offset; the trailing -1 guarantees there is always one
      following = @pos[(i + 1)..-1].compact.first
      to = following == -1 ? -1 : following - 1   ## sub one unless end-of-string (-1)

      logger.debug " cut section #{i} [#{from}..#{to}]"
      Nokogiri::HTML( page[ from..to ] )
    end
  end
end
|
102
|
+
|
103
|
+
## For debugging n testing: lets you set the html directly
## (no need to fetch via net).
##
## Everything derived from a previously set/fetched html (parsed document,
## section offsets, sections, data hash) is dropped, so that later calls
## are computed from the new html instead of returning stale results.
def html=(html)
  @html  = html

  @doc   = nil
  @pos   = nil
  @sects = nil
  @data  = nil
end
|
108
|
+
|
109
|
+
## Returns the html of the page; fetched (see fetch) and cleaned up on
## first use, memoized afterwards. If the html was set via html= it is
## returned as is.
##
## Returns nil (and caches nothing, so a later call tries again) if fetch
## returns nil, that is, if the page could not be downloaded.
##
## Clean up steps, in this order:
##  1. remove inline script, inline style and link tags
##  2. remove everything before <div id="countryInfo">
##  3. remove "country comparison to the world" spans
##  4. remove style attributes
##  5. remove table rows holding a single empty cell
##  6. remove the world leaders website promo
##
## Every removed snippet is logged (debug level).
def html
  return @html unless @html.nil?

  text = fetch
  return nil if text.nil?   ## fetch failed (non-200 response is logged by fetch)

  ## removes all matches of regex from text; label is used for logging only
  remove = lambda do |label, regex|
    text = text.gsub( regex ) do |m|
      logger.debug "remove #{label}:"
      logger.debug m
      ''
    end
  end

  remove.call( 'script', /<script[^>]*>.*?<\/script>/m )
  remove.call( 'style',  /<style[^>]*>.*?<\/style>/m )
  remove.call( 'link',   /<link[^>]+>/ )

  ## remove everything before <div id="countryInfo" >
  ##  NOTE(review): only matches the div w/o any further attributes; style
  ##  attributes are removed later (step 4) - confirm against the live page
  pos = text.index( /<div id="countryInfo"\s*>/ )
  text = text[pos..-1] if pos

  ## remove country comparison
  ##   e.g. <span class="category" style="padding-left:7px;">country comparison to the world:</span>
  ##        <span class="category_data">
  ##          <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br"
  ##             onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a>
  ##        </span>
  country_comparison_regex = /
    <span \s class="category"[^>]*>
      country \s comparison \s to \s the \s world:
    <\/span>
    \s*
    <span \s class="category_data"[^>]*>
      \s*
      <a \s [^>]+>
        .+?
      <\/a>
      \s*
    <\/span>
  /xm
  remove.call( 'country comparison', country_comparison_regex )

  remove.call( 'style attr', /\s*style="[^"]+"/ )

  ## remove empty rows
  ##   e.g. <tr height="22">
  ##          <td class="category_data"></td>
  ##        </tr>
  tr_empty_regex = /
    <tr[^>]*>
      \s*
      <td[^>]*> \s* <\/td>
      \s*
    <\/tr>
  /xm
  remove.call( 'tr emtpy', tr_empty_regex )

  ## remove world leader website promo
  ##   e.g. <span class="category">(For more information visit the
  ##          <a href="/library/publications/world-leaders-1/index.html" target="_blank">World Leaders website</a>
  ##          <img src="../graphics/soa_newwindow.gif" alt="Opens in New Window" title="Opens in New Window" border="0"/>)
  ##        </span>
  world_leaders_website_regex = /
    <span \s class="category"[^>]*>
      \(
        For \s more \s information \s
        .+?     # non-greedy (smallest possible match)
      \)
    <\/span>
  /xm
  remove.call( 'world leader website promo', world_leaders_website_regex )

  @html = text
end
|
227
|
+
|
228
|
+
private
|
229
|
+
## Downloads the page for @code (SITE_BASE with {code} replaced) using
## Fetcher::Worker.
##
## Returns the response body tagged as UTF-8, or nil (after logging an
## error) if the response code is not '200'.
def fetch
  url      = SITE_BASE.gsub('{code}', @code)
  response = Fetcher::Worker.new.get_response(url)

  unless response.code == '200'
    logger.error "fetch HTTP - #{response.code} #{response.message}"
    return nil
  end

  body = response.body

  ## NB: Net::HTTP will NOT set encoding UTF-8 etc.
  ##   (ASCII-8BIT == BINARY == encoding unknown; raw bytes)
  ##  for now "hardcoded" to utf8 - what else can we do?
  ##  note: force_encoding will NOT change the bytes; it only changes
  ##        the assumed encoding (w/o translation)
  logger.debug "t.encoding.name (before): #{body.encoding.name}"
  body = body.force_encoding(Encoding::UTF_8)
  logger.debug "t.encoding.name (after): #{body.encoding.name}"

  body
end
|
257
|
+
|
258
|
+
|
259
|
+
## Turns a category title into a hash key.
##
##   e.g. 'Seaport(s):'                       => 'seaports'
##        'Military - note:'                  => 'military_note'
##        'HIV/AIDS - adult prevalence rate:' => 'hiv_aids_adult_prevalence_rate'
##
## key - title as found on the page (String)
##
## Returns a new lower case string with words joined by underscores.
def cleanup_key( key )
  ## to lower case
  key = key.downcase
  ## seaport(s) => seaports
  key = key.gsub( '(s)', 's' )
  key = key.gsub( ':', '' )   # trailing :
  ## replace special chars ()-/,' with a space
  ##  note: the apostrophe must be inside the character class; otherwise only
  ##        a special char directly followed by an apostrophe gets matched
  key = key.gsub( /['()\-\/,]/, ' ' )
  key = key.strip
  ## collapse runs of spaces into a single underscore
  key.gsub( /[ ]+/, '_' )
end
|
271
|
+
|
272
|
+
|
273
|
+
## Converts one section into a hash.
##
## sect - section document as produced by Nokogiri::HTML; all 'table tr td'
##        cells get visited in document order
##
## A cell holding a (nested) element with id field starts a new category;
## its single div.category gives the key (see cleanup_key).
## A cell with id data holds the values for the most recent category:
##   - every child div.category starts a new [key, value] pair; the value is
##     the text of the span.category_data elements inside the div
##   - every child div.category_data gets appended to the current pair
##     (a leading div.category_data w/o pair is stored under the key 'text')
## Everything else is logged as a warning and skipped.
##
## Returns a hash of category key => hash of key/value pairs.
def sect_to_hash(sect)
  rows      = sect.css('table tr')
  cells     = sect.css('table tr td')
  field_ids = rows.css('#field')   ## check - use div#field.category -- possible?
  data_ids  = rows.css('#data')

  logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

  result      = {}
  current_cat = nil

  cells.each_with_index do |cell, cell_no|
    ## field cell? (check for nested div#field in td)
    if cell.css('#field').size == 1
      headings = cell.css('div.category')   ## note: ignores .category not using div
      if headings.size == 1
        current_cat = cleanup_key(headings.first.text.strip)
        logger.debug " [#{cell_no}] category: >>#{current_cat}<<"
      else
        logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
        logger.warn cell.to_s
      end
      next
    end

    ## neither field nor data cell (td#data)?
    unless cell['id'] == 'data'
      logger.warn "#### !!!! unknown cell type (no field or data id found):"
      logger.warn cell.to_s
      next
    end

    ## -- data cell; counts are for logging only
    cats           = cell.css('div.category')
    cats_data      = cell.css('div.category_data,span.category_data')   ## note: ignore a.category_data etc.
    cats_div_data  = cell.css('div.category_data')
    cats_span_data = cell.css('span.category_data')

    logger.debug " - [#{cell_no}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"

    pairs      = []
    current    = nil   ## pair ([key, value]) that following category_data divs get appended to
    data_count = 0     ## number of category_data divs appended to current so far

    ## loop over div blocks (might be .category or .category_data)
    cell.children.each do |child|
      next unless child.element?   ## skip text nodes etc.

      if child.name != 'div'
        logger.warn " **** !!! skipping non-div >#{child.name}<:"
        logger.warn child.to_s
        next
      end

      case child['class']
      when 'category'
        ## key: text of all children except elements w/ class category_data
        title_parts = child.children.reject do |node|
          node.element? && node['class'] == 'category_data'
        end
        key   = cleanup_key(title_parts.map { |node| node.text.strip }.join)
        value = child.css('span.category_data').text.strip

        logger.debug " -- category >>#{key}<<"

        ## start new pair
        current    = [key, value]
        data_count = 0
        pairs << current

      when 'category_data'
        logger.debug " -- category_data"

        text = child.text.strip

        if current.nil?
          ## assume its the very first entry; use implied/auto-created category
          current    = ['text', '']
          data_count = 0
          pairs << current
        end

        if data_count.zero?
          ## first category_data element for this pair
          current[1] = current[1] == '' ? text : "#{current[1]} #{text}"
        else
          ## special case: demographic_profile uses a space as separator
          separator  = current_cat == 'demographic_profile' ? ' ' : '; '
          current[1] = "#{current[1]}#{separator}#{text}"
        end
        data_count += 1

      else
        logger.warn " **** !!! skipping div w/o category or category_data class:"
        logger.warn child.to_s
      end
    end

    ## pairs to hash (a later pair wins if a key shows up more than once)
    result[current_cat] = Hash[pairs]
  end

  result
end
|
405
|
+
|
406
|
+
end # class Page
|
407
|
+
|
408
|
+
end # module Factbook
|
data/lib/factbook/version.rb
CHANGED
data/lib/factbook.rb
CHANGED
@@ -6,6 +6,7 @@ require 'net/http'
|
|
6
6
|
require 'uri'
|
7
7
|
require 'cgi'
|
8
8
|
require 'pp'
|
9
|
+
require 'json'
|
9
10
|
|
10
11
|
|
11
12
|
## 3rd party gems/libs
|
@@ -13,10 +14,13 @@ require 'pp'
|
|
13
14
|
|
14
15
|
require 'logutils'
|
15
16
|
require 'fetcher'
|
17
|
+
require 'nokogiri'
|
18
|
+
|
16
19
|
|
17
20
|
# our own code
|
18
21
|
|
19
22
|
require 'factbook/version' # let it always go first
|
23
|
+
require 'factbook/page'
|
20
24
|
|
21
25
|
|
22
26
|
module Factbook
|