factbook 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +5 -0
- data/README.md +6 -3
- data/Rakefile +309 -17
- data/lib/factbook/page.rb +58 -201
- data/lib/factbook/sect.rb +179 -0
- data/lib/factbook/version.rb +1 -1
- data/lib/factbook.rb +2 -0
- data/test/data/countrytemplate_ee.html +2999 -0
- data/test/data/countrytemplate_ls.html +2728 -0
- data/test/data/countrytemplate_vt.html +1726 -0
- data/test/data/countrytemplate_xx.html +2898 -0
- data/test/test_json.rb +31 -29
- data/test/test_page.rb +18 -209
- data/test/test_page_old.rb +191 -3
- metadata +17 -12
data/lib/factbook/page.rb
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
|
3
3
|
module Factbook
|
4
4
|
|
5
|
-
class Page
|
6
5
|
|
6
|
+
class Page
|
7
7
|
include LogUtils::Logging
|
8
8
|
|
9
9
|
## standard version
|
@@ -14,7 +14,15 @@ module Factbook
|
|
14
14
|
SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
|
15
15
|
|
16
16
|
def initialize( code )
|
17
|
-
|
17
|
+
## note: requires factbook country code
|
18
|
+
# e.g. austria is au
|
19
|
+
# germany is gm and so on
|
20
|
+
@code = code
|
21
|
+
|
22
|
+
@html = nil
|
23
|
+
@doc = nil
|
24
|
+
@sects = nil
|
25
|
+
@data = nil
|
18
26
|
end
|
19
27
|
|
20
28
|
def doc
|
@@ -32,25 +40,13 @@ module Factbook
|
|
32
40
|
|
33
41
|
def data
|
34
42
|
if @data.nil?
|
35
|
-
titles = [
|
36
|
-
'intro',
|
37
|
-
'geo',
|
38
|
-
'people',
|
39
|
-
'govt',
|
40
|
-
'econ',
|
41
|
-
'energy',
|
42
|
-
'comm',
|
43
|
-
'trans',
|
44
|
-
'military',
|
45
|
-
'issues' ]
|
46
|
-
|
47
43
|
@data = {}
|
48
44
|
|
49
45
|
sects.each_with_index do |sect,i|
|
50
46
|
logger.debug "############################"
|
51
|
-
logger.debug "### stats sect
|
47
|
+
logger.debug "### [#{i}] stats sect >#{sect.title}<: "
|
52
48
|
|
53
|
-
@data[
|
49
|
+
@data[ sect.title ] = sect.data
|
54
50
|
end
|
55
51
|
end
|
56
52
|
@data
|
@@ -58,51 +54,60 @@ module Factbook
|
|
58
54
|
|
59
55
|
|
60
56
|
def sects
|
61
|
-
## split html into sections
|
62
|
-
## to avoid errors w/ nested tags
|
63
|
-
|
64
|
-
divs = [
|
65
|
-
'<div id="CollapsiblePanel1_Intro"',
|
66
|
-
'<div id="CollapsiblePanel1_Geo"',
|
67
|
-
'<div id="CollapsiblePanel1_People"',
|
68
|
-
'<div id="CollapsiblePanel1_Govt"',
|
69
|
-
'<div id="CollapsiblePanel1_Econ"',
|
70
|
-
'<div id="CollapsiblePanel1_Energy"',
|
71
|
-
'<div id="CollapsiblePanel1_Comm"',
|
72
|
-
'<div id="CollapsiblePanel1_Trans"',
|
73
|
-
'<div id="CollapsiblePanel1_Military"',
|
74
|
-
'<div id="CollapsiblePanel1_Issues"' ]
|
75
|
-
|
76
57
|
if @sects.nil?
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
divs
|
58
|
+
## split html into sections
|
59
|
+
## lets us avoid errors w/ (wrongly) nested tags
|
60
|
+
|
61
|
+
divs = [
|
62
|
+
[ 'intro', '<div id="CollapsiblePanel1_Intro"' ],
|
63
|
+
[ 'geo', '<div id="CollapsiblePanel1_Geo"' ],
|
64
|
+
[ 'people', '<div id="CollapsiblePanel1_People"' ],
|
65
|
+
[ 'govt', '<div id="CollapsiblePanel1_Govt"' ],
|
66
|
+
[ 'econ', '<div id="CollapsiblePanel1_Econ"' ],
|
67
|
+
[ 'energy', '<div id="CollapsiblePanel1_Energy"' ],
|
68
|
+
[ 'comm', '<div id="CollapsiblePanel1_Comm"' ],
|
69
|
+
[ 'trans', '<div id="CollapsiblePanel1_Trans"' ],
|
70
|
+
[ 'military', '<div id="CollapsiblePanel1_Military"'],
|
71
|
+
[ 'issues', '<div id="CollapsiblePanel1_Issues"' ]
|
72
|
+
]
|
73
|
+
|
74
|
+
indexes = []
|
75
|
+
|
76
|
+
## note:
|
77
|
+
## skip missing sections (w/ warning)
|
78
|
+
## e.g. Vatican (Holy See), Liechtenstein etc. have no Energy section, for example
|
79
|
+
|
80
|
+
divs.each_with_index do |rec,i|
|
81
|
+
title = rec[0]
|
82
|
+
div = rec[1]
|
81
83
|
p = html.index( div )
|
82
84
|
if p.nil?
|
83
|
-
## issue
|
84
|
-
|
85
|
+
## issue warning: if not found
|
86
|
+
logger.warn "***!!! section not found -- #{div} --; skipping"
|
85
87
|
else
|
86
|
-
|
88
|
+
logger.debug " found section #{i} @ #{p}"
|
89
|
+
indexes << [title,p]
|
87
90
|
end
|
88
|
-
|
89
|
-
@pos << p
|
90
91
|
end
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
92
|
+
|
93
|
+
@sects = []
|
94
|
+
|
95
|
+
indexes.each_with_index do |rec,i|
|
96
|
+
title = rec[0]
|
97
|
+
from = rec[1]
|
98
|
+
|
99
|
+
# is last entry? if yes use -1 otherwise pos
|
100
|
+
# note: subtract one (-1) from pos unless end-of-string (-1)
|
101
|
+
to = indexes[i+1].nil? ? -1 : indexes[i+1][1]-1
|
97
102
|
|
98
103
|
## todo: check that from is smaller than to
|
99
|
-
|
100
|
-
@sects <<
|
101
|
-
|
102
|
-
if i==0 || i==1
|
103
|
-
|
104
|
-
|
105
|
-
end
|
104
|
+
logger.debug " cut section #{i} [#{from}..#{to}]"
|
105
|
+
@sects << Sect.new( title, html[ from..to ] )
|
106
|
+
|
107
|
+
##if i==0 || i==1
|
108
|
+
## puts "debug sect #{i}:"
|
109
|
+
## puts ">>>|||#{html[ from..to ]}|||<<<"
|
110
|
+
##end
|
106
111
|
end
|
107
112
|
end
|
108
113
|
|
@@ -264,154 +269,6 @@ module Factbook
|
|
264
269
|
end
|
265
270
|
end
|
266
271
|
|
267
|
-
|
268
|
-
def cleanup_key( key )
|
269
|
-
## to lower case
|
270
|
-
key = key.downcase
|
271
|
-
## seaport(s) => seaports
|
272
|
-
key = key.gsub( '(s)', 's' )
|
273
|
-
key = key.gsub( ':', '' ) # trailing :
|
274
|
-
## remove special chars ()-/,'
|
275
|
-
key = key.gsub( /['()\-\/,]/, ' ' )
|
276
|
-
key = key.strip
|
277
|
-
key = key.gsub( /[ ]+/, '_' )
|
278
|
-
key
|
279
|
-
end
|
280
|
-
|
281
|
-
|
282
|
-
def sect_to_hash( sect )
|
283
|
-
|
284
|
-
rows = sect.css( 'table tr' )
|
285
|
-
cells = sect.css( 'table tr td' )
|
286
|
-
field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
|
287
|
-
data_ids = rows.css( '#data' )
|
288
|
-
|
289
|
-
logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
|
290
|
-
|
291
|
-
hash = {}
|
292
|
-
last_cat = nil
|
293
|
-
|
294
|
-
cells.each_with_index do |cell,i|
|
295
|
-
## next if i > 14 ## skip after xx for debugging for now
|
296
|
-
|
297
|
-
# check if field or data id
|
298
|
-
# check for (nested) div#field in td
|
299
|
-
has_field_id = cell.css( '#field' ).size == 1 ? true : false
|
300
|
-
|
301
|
-
# check for td#data
|
302
|
-
has_data_id = cell['id'] == 'data' ? true : false
|
303
|
-
|
304
|
-
if has_field_id
|
305
|
-
|
306
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
307
|
-
if cats.size == 1
|
308
|
-
text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
|
309
|
-
last_cat = text
|
310
|
-
logger.debug " [#{i}] category: >>#{text}<<"
|
311
|
-
else
|
312
|
-
logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
|
313
|
-
logger.warn cell.to_s
|
314
|
-
end
|
315
|
-
|
316
|
-
elsif has_data_id
|
317
|
-
|
318
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
319
|
-
cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
|
320
|
-
cats_div_data = cell.css( 'div.category_data' )
|
321
|
-
cats_span_data = cell.css( 'span.category_data' )
|
322
|
-
|
323
|
-
logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
|
324
|
-
|
325
|
-
pairs = []
|
326
|
-
last_pair = nil
|
327
|
-
last_pair_data_count = 0
|
328
|
-
|
329
|
-
## loop over div blocks (might be .category or .category_data)
|
330
|
-
cell.children.each_with_index do |child,j|
|
331
|
-
unless child.element?
|
332
|
-
## puts " **** !!!! skipping non-element type >#{child.type}<:"
|
333
|
-
## puts child.to_s
|
334
|
-
next
|
335
|
-
end
|
336
|
-
unless child.name == 'div'
|
337
|
-
logger.warn " **** !!! skipping non-div >#{child.name}<:"
|
338
|
-
logger.warn child.to_s
|
339
|
-
next
|
340
|
-
end
|
341
|
-
|
342
|
-
### check if .category or .category_data
|
343
|
-
if child['class'] == 'category'
|
344
|
-
|
345
|
-
## collect text for category; exclude element w/ class.category_data
|
346
|
-
text = ""
|
347
|
-
child.children.each do |subchild|
|
348
|
-
text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
|
349
|
-
end
|
350
|
-
text = cleanup_key( text )
|
351
|
-
|
352
|
-
value = child.css('span.category_data').text.strip
|
353
|
-
|
354
|
-
logger.debug " -- category >>#{text}<<"
|
355
|
-
|
356
|
-
## start new pair
|
357
|
-
last_pair = [ text, value ]
|
358
|
-
last_pair_data_count = 0
|
359
|
-
pairs << last_pair
|
360
|
-
|
361
|
-
elsif child['class'] == 'category_data'
|
362
|
-
logger.debug " -- category_data"
|
363
|
-
|
364
|
-
text = child.text.strip
|
365
|
-
|
366
|
-
if last_pair.nil?
|
367
|
-
## assume its the very first entry; use implied/auto-created category
|
368
|
-
last_pair = [ 'text', '' ]
|
369
|
-
last_pair_data_count = 0
|
370
|
-
pairs << last_pair
|
371
|
-
end
|
372
|
-
|
373
|
-
### first category_data element?
|
374
|
-
if last_pair_data_count == 0
|
375
|
-
if last_pair[1] == ''
|
376
|
-
last_pair[1] = text
|
377
|
-
else
|
378
|
-
last_pair[1] += " #{text}" ## append w/o separator
|
379
|
-
end
|
380
|
-
else
|
381
|
-
if last_cat == 'demographic_profile' ## special case (use space a sep)
|
382
|
-
last_pair[1] += " #{text}" ## append with separator
|
383
|
-
else
|
384
|
-
last_pair[1] += "; #{text}" ## append with separator
|
385
|
-
end
|
386
|
-
end
|
387
|
-
last_pair_data_count += 1
|
388
|
-
|
389
|
-
else
|
390
|
-
logger.warn " **** !!! skipping div w/o category or category_data class:"
|
391
|
-
logger.warn child.to_s
|
392
|
-
end
|
393
|
-
end
|
394
|
-
|
395
|
-
## pp pairs
|
396
|
-
|
397
|
-
## pairs to hash
|
398
|
-
pairs_hash = {}
|
399
|
-
pairs.each do |pair|
|
400
|
-
pairs_hash[ pair[0] ] = pair[1]
|
401
|
-
end
|
402
|
-
|
403
|
-
hash[ last_cat ] = pairs_hash
|
404
|
-
|
405
|
-
else
|
406
|
-
logger.warn "#### !!!! unknown cell type (no field or data id found):"
|
407
|
-
logger.warn cell.to_s
|
408
|
-
end
|
409
|
-
end # each cell
|
410
|
-
|
411
|
-
hash # return hash
|
412
|
-
|
413
|
-
end # method sect_to_hash
|
414
|
-
|
415
272
|
end # class Page
|
416
273
|
|
417
274
|
end # module Factbook
|
@@ -0,0 +1,179 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
class Sect # section (e.g. Introduction/Geography/People/Economy/Energy/Transport/etc.)
|
6
|
+
include LogUtils::Logging
|
7
|
+
|
8
|
+
attr_reader :title, :html
|
9
|
+
|
10
|
+
def initialize( title, html )
|
11
|
+
## todo: passing a ref to the parent page - why? why not??
|
12
|
+
@title = title
|
13
|
+
@html = html
|
14
|
+
|
15
|
+
@doc = nil
|
16
|
+
@data = nil
|
17
|
+
end
|
18
|
+
|
19
|
+
def doc
|
20
|
+
### check: use nokogiri html fragment? why? why not??
|
21
|
+
@doc ||= Nokogiri::HTML( @html )
|
22
|
+
end
|
23
|
+
|
24
|
+
def data
|
25
|
+
@data ||= sect_to_hash( doc )
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def cleanup_key( key )
|
31
|
+
## to lower case
|
32
|
+
key = key.downcase
|
33
|
+
## seaport(s) => seaports
|
34
|
+
key = key.gsub( '(s)', 's' )
|
35
|
+
key = key.gsub( ':', '' ) # trailing :
|
36
|
+
## remove special chars ()-/,'
|
37
|
+
key = key.gsub( /['()\-\/,]/, ' ' )
|
38
|
+
key = key.strip
|
39
|
+
key = key.gsub( /[ ]+/, '_' )
|
40
|
+
key
|
41
|
+
end
|
42
|
+
|
43
|
+
|
44
|
+
def sect_to_hash( sect )
|
45
|
+
|
46
|
+
rows = sect.css( 'table tr' )
|
47
|
+
cells = sect.css( 'table tr td' )
|
48
|
+
field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
|
49
|
+
data_ids = rows.css( '#data' )
|
50
|
+
|
51
|
+
logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
|
52
|
+
|
53
|
+
hash = {}
|
54
|
+
last_cat = nil
|
55
|
+
|
56
|
+
cells.each_with_index do |cell,i|
|
57
|
+
## next if i > 14 ## skip after xx for debugging for now
|
58
|
+
|
59
|
+
# check if field or data id
|
60
|
+
# check for (nested) div#field in td
|
61
|
+
has_field_id = cell.css( '#field' ).size == 1 ? true : false
|
62
|
+
|
63
|
+
# check for td#data
|
64
|
+
has_data_id = cell['id'] == 'data' ? true : false
|
65
|
+
|
66
|
+
if has_field_id
|
67
|
+
|
68
|
+
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
69
|
+
if cats.size == 1
|
70
|
+
text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
|
71
|
+
last_cat = text
|
72
|
+
logger.debug " [#{i}] category: >>#{text}<<"
|
73
|
+
else
|
74
|
+
logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
|
75
|
+
logger.warn cell.to_s
|
76
|
+
end
|
77
|
+
|
78
|
+
elsif has_data_id
|
79
|
+
|
80
|
+
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
81
|
+
cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
|
82
|
+
cats_div_data = cell.css( 'div.category_data' )
|
83
|
+
cats_span_data = cell.css( 'span.category_data' )
|
84
|
+
|
85
|
+
logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
|
86
|
+
|
87
|
+
pairs = []
|
88
|
+
last_pair = nil
|
89
|
+
last_pair_data_count = 0
|
90
|
+
|
91
|
+
## loop over div blocks (might be .category or .category_data)
|
92
|
+
cell.children.each_with_index do |child,j|
|
93
|
+
unless child.element?
|
94
|
+
## puts " **** !!!! skipping non-element type >#{child.type}<:"
|
95
|
+
## puts child.to_s
|
96
|
+
next
|
97
|
+
end
|
98
|
+
unless child.name == 'div'
|
99
|
+
logger.warn " **** !!! skipping non-div >#{child.name}<:"
|
100
|
+
logger.warn child.to_s
|
101
|
+
next
|
102
|
+
end
|
103
|
+
|
104
|
+
### check if .category or .category_data
|
105
|
+
if child['class'] == 'category'
|
106
|
+
|
107
|
+
## collect text for category; exclude element w/ class.category_data
|
108
|
+
text = ""
|
109
|
+
child.children.each do |subchild|
|
110
|
+
text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
|
111
|
+
end
|
112
|
+
text = cleanup_key( text )
|
113
|
+
|
114
|
+
value = child.css('span.category_data').text.strip
|
115
|
+
|
116
|
+
logger.debug " -- category >>#{text}<<"
|
117
|
+
|
118
|
+
## start new pair
|
119
|
+
last_pair = [ text, value ]
|
120
|
+
last_pair_data_count = 0
|
121
|
+
pairs << last_pair
|
122
|
+
|
123
|
+
elsif child['class'] == 'category_data'
|
124
|
+
logger.debug " -- category_data"
|
125
|
+
|
126
|
+
text = child.text.strip
|
127
|
+
|
128
|
+
if last_pair.nil?
|
129
|
+
## assume its the very first entry; use implied/auto-created category
|
130
|
+
last_pair = [ 'text', '' ]
|
131
|
+
last_pair_data_count = 0
|
132
|
+
pairs << last_pair
|
133
|
+
end
|
134
|
+
|
135
|
+
### first category_data element?
|
136
|
+
if last_pair_data_count == 0
|
137
|
+
if last_pair[1] == ''
|
138
|
+
last_pair[1] = text
|
139
|
+
else
|
140
|
+
last_pair[1] += " #{text}" ## append w/o separator
|
141
|
+
end
|
142
|
+
else
|
143
|
+
if last_cat == 'demographic_profile' ## special case (use space a sep)
|
144
|
+
last_pair[1] += " #{text}" ## append with separator
|
145
|
+
else
|
146
|
+
last_pair[1] += "; #{text}" ## append with separator
|
147
|
+
end
|
148
|
+
end
|
149
|
+
last_pair_data_count += 1
|
150
|
+
|
151
|
+
else
|
152
|
+
logger.warn " **** !!! skipping div w/o category or category_data class:"
|
153
|
+
logger.warn child.to_s
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
## pp pairs
|
158
|
+
|
159
|
+
## pairs to hash
|
160
|
+
pairs_hash = {}
|
161
|
+
pairs.each do |pair|
|
162
|
+
pairs_hash[ pair[0] ] = pair[1]
|
163
|
+
end
|
164
|
+
|
165
|
+
hash[ last_cat ] = pairs_hash
|
166
|
+
|
167
|
+
else
|
168
|
+
logger.warn "#### !!!! unknown cell type (no field or data id found):"
|
169
|
+
logger.warn cell.to_s
|
170
|
+
end
|
171
|
+
end # each cell
|
172
|
+
|
173
|
+
hash # return hash
|
174
|
+
|
175
|
+
end # method sect_to_hash
|
176
|
+
|
177
|
+
end # class Sect
|
178
|
+
|
179
|
+
end # module Factbook
|
data/lib/factbook/version.rb
CHANGED
data/lib/factbook.rb
CHANGED
@@ -7,6 +7,7 @@ require 'uri'
|
|
7
7
|
require 'cgi'
|
8
8
|
require 'pp'
|
9
9
|
require 'json'
|
10
|
+
require 'fileutils'
|
10
11
|
|
11
12
|
|
12
13
|
## 3rd party gems/libs
|
@@ -21,6 +22,7 @@ require 'nokogiri'
|
|
21
22
|
|
22
23
|
require 'factbook/version' # let it always go first
|
23
24
|
require 'factbook/page'
|
25
|
+
require 'factbook/sect'
|
24
26
|
|
25
27
|
|
26
28
|
module Factbook
|