factbook 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest.txt +5 -0
- data/README.md +6 -3
- data/Rakefile +309 -17
- data/lib/factbook/page.rb +58 -201
- data/lib/factbook/sect.rb +179 -0
- data/lib/factbook/version.rb +1 -1
- data/lib/factbook.rb +2 -0
- data/test/data/countrytemplate_ee.html +2999 -0
- data/test/data/countrytemplate_ls.html +2728 -0
- data/test/data/countrytemplate_vt.html +1726 -0
- data/test/data/countrytemplate_xx.html +2898 -0
- data/test/test_json.rb +31 -29
- data/test/test_page.rb +18 -209
- data/test/test_page_old.rb +191 -3
- metadata +17 -12
data/lib/factbook/page.rb
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
|
3
3
|
module Factbook
|
4
4
|
|
5
|
-
class Page
|
6
5
|
|
6
|
+
class Page
|
7
7
|
include LogUtils::Logging
|
8
8
|
|
9
9
|
## standard version
|
@@ -14,7 +14,15 @@ module Factbook
|
|
14
14
|
SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
|
15
15
|
|
16
16
|
def initialize( code )
|
17
|
-
|
17
|
+
## note: requires factbook country code
|
18
|
+
# e.g. austria is au
|
19
|
+
# germany is gm and so on
|
20
|
+
@code = code
|
21
|
+
|
22
|
+
@html = nil
|
23
|
+
@doc = nil
|
24
|
+
@sects = nil
|
25
|
+
@data = nil
|
18
26
|
end
|
19
27
|
|
20
28
|
def doc
|
@@ -32,25 +40,13 @@ module Factbook
|
|
32
40
|
|
33
41
|
def data
|
34
42
|
if @data.nil?
|
35
|
-
titles = [
|
36
|
-
'intro',
|
37
|
-
'geo',
|
38
|
-
'people',
|
39
|
-
'govt',
|
40
|
-
'econ',
|
41
|
-
'energy',
|
42
|
-
'comm',
|
43
|
-
'trans',
|
44
|
-
'military',
|
45
|
-
'issues' ]
|
46
|
-
|
47
43
|
@data = {}
|
48
44
|
|
49
45
|
sects.each_with_index do |sect,i|
|
50
46
|
logger.debug "############################"
|
51
|
-
logger.debug "### stats sect
|
47
|
+
logger.debug "### [#{i}] stats sect >#{sect.title}<: "
|
52
48
|
|
53
|
-
@data[
|
49
|
+
@data[ sect.title ] = sect.data
|
54
50
|
end
|
55
51
|
end
|
56
52
|
@data
|
@@ -58,51 +54,60 @@ module Factbook
|
|
58
54
|
|
59
55
|
|
60
56
|
def sects
|
61
|
-
## split html into sections
|
62
|
-
## to avoid errors w/ nested tags
|
63
|
-
|
64
|
-
divs = [
|
65
|
-
'<div id="CollapsiblePanel1_Intro"',
|
66
|
-
'<div id="CollapsiblePanel1_Geo"',
|
67
|
-
'<div id="CollapsiblePanel1_People"',
|
68
|
-
'<div id="CollapsiblePanel1_Govt"',
|
69
|
-
'<div id="CollapsiblePanel1_Econ"',
|
70
|
-
'<div id="CollapsiblePanel1_Energy"',
|
71
|
-
'<div id="CollapsiblePanel1_Comm"',
|
72
|
-
'<div id="CollapsiblePanel1_Trans"',
|
73
|
-
'<div id="CollapsiblePanel1_Military"',
|
74
|
-
'<div id="CollapsiblePanel1_Issues"' ]
|
75
|
-
|
76
57
|
if @sects.nil?
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
divs
|
58
|
+
## split html into sections
|
59
|
+
## lets us avoid errors w/ (wrongly) nested tags
|
60
|
+
|
61
|
+
divs = [
|
62
|
+
[ 'intro', '<div id="CollapsiblePanel1_Intro"' ],
|
63
|
+
[ 'geo', '<div id="CollapsiblePanel1_Geo"' ],
|
64
|
+
[ 'people', '<div id="CollapsiblePanel1_People"' ],
|
65
|
+
[ 'govt', '<div id="CollapsiblePanel1_Govt"' ],
|
66
|
+
[ 'econ', '<div id="CollapsiblePanel1_Econ"' ],
|
67
|
+
[ 'energy', '<div id="CollapsiblePanel1_Energy"' ],
|
68
|
+
[ 'comm', '<div id="CollapsiblePanel1_Comm"' ],
|
69
|
+
[ 'trans', '<div id="CollapsiblePanel1_Trans"' ],
|
70
|
+
[ 'military', '<div id="CollapsiblePanel1_Military"'],
|
71
|
+
[ 'issues', '<div id="CollapsiblePanel1_Issues"' ]
|
72
|
+
]
|
73
|
+
|
74
|
+
indexes = []
|
75
|
+
|
76
|
+
## note:
|
77
|
+
## skip missing sections (w/ warning)
|
78
|
+
## e.g. Vatican (Holy See), Liechtenstein etc. have no Energy section, for example
|
79
|
+
|
80
|
+
divs.each_with_index do |rec,i|
|
81
|
+
title = rec[0]
|
82
|
+
div = rec[1]
|
81
83
|
p = html.index( div )
|
82
84
|
if p.nil?
|
83
|
-
## issue
|
84
|
-
|
85
|
+
## issue warning: if not found
|
86
|
+
logger.warn "***!!! section not found -- #{div} --; skipping"
|
85
87
|
else
|
86
|
-
|
88
|
+
logger.debug " found section #{i} @ #{p}"
|
89
|
+
indexes << [title,p]
|
87
90
|
end
|
88
|
-
|
89
|
-
@pos << p
|
90
91
|
end
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
92
|
+
|
93
|
+
@sects = []
|
94
|
+
|
95
|
+
indexes.each_with_index do |rec,i|
|
96
|
+
title = rec[0]
|
97
|
+
from = rec[1]
|
98
|
+
|
99
|
+
# is last entry? if yes use -1 otherwise pos
|
100
|
+
# note: subtract one (-1) from pos unless end-of-string (-1)
|
101
|
+
to = indexes[i+1].nil? ? -1 : indexes[i+1][1]-1
|
97
102
|
|
98
103
|
## todo: check that from is smaller than to
|
99
|
-
|
100
|
-
@sects <<
|
101
|
-
|
102
|
-
if i==0 || i==1
|
103
|
-
|
104
|
-
|
105
|
-
end
|
104
|
+
logger.debug " cut section #{i} [#{from}..#{to}]"
|
105
|
+
@sects << Sect.new( title, html[ from..to ] )
|
106
|
+
|
107
|
+
##if i==0 || i==1
|
108
|
+
## puts "debug sect #{i}:"
|
109
|
+
## puts ">>>|||#{html[ from..to ]}|||<<<"
|
110
|
+
##end
|
106
111
|
end
|
107
112
|
end
|
108
113
|
|
@@ -264,154 +269,6 @@ module Factbook
|
|
264
269
|
end
|
265
270
|
end
|
266
271
|
|
267
|
-
|
268
|
-
def cleanup_key( key )
|
269
|
-
## to lower case
|
270
|
-
key = key.downcase
|
271
|
-
## seaport(s) => seaports
|
272
|
-
key = key.gsub( '(s)', 's' )
|
273
|
-
key = key.gsub( ':', '' ) # trailing :
|
274
|
-
## remove special chars ()-/,'
|
275
|
-
key = key.gsub( /['()\-\/,]/, ' ' )
|
276
|
-
key = key.strip
|
277
|
-
key = key.gsub( /[ ]+/, '_' )
|
278
|
-
key
|
279
|
-
end
|
280
|
-
|
281
|
-
|
282
|
-
def sect_to_hash( sect )
|
283
|
-
|
284
|
-
rows = sect.css( 'table tr' )
|
285
|
-
cells = sect.css( 'table tr td' )
|
286
|
-
field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
|
287
|
-
data_ids = rows.css( '#data' )
|
288
|
-
|
289
|
-
logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
|
290
|
-
|
291
|
-
hash = {}
|
292
|
-
last_cat = nil
|
293
|
-
|
294
|
-
cells.each_with_index do |cell,i|
|
295
|
-
## next if i > 14 ## skip after xx for debugging for now
|
296
|
-
|
297
|
-
# check if field or data id
|
298
|
-
# check for (nested) div#field in td
|
299
|
-
has_field_id = cell.css( '#field' ).size == 1 ? true : false
|
300
|
-
|
301
|
-
# check for td#data
|
302
|
-
has_data_id = cell['id'] == 'data' ? true : false
|
303
|
-
|
304
|
-
if has_field_id
|
305
|
-
|
306
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
307
|
-
if cats.size == 1
|
308
|
-
text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
|
309
|
-
last_cat = text
|
310
|
-
logger.debug " [#{i}] category: >>#{text}<<"
|
311
|
-
else
|
312
|
-
logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
|
313
|
-
logger.warn cell.to_s
|
314
|
-
end
|
315
|
-
|
316
|
-
elsif has_data_id
|
317
|
-
|
318
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
319
|
-
cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
|
320
|
-
cats_div_data = cell.css( 'div.category_data' )
|
321
|
-
cats_span_data = cell.css( 'span.category_data' )
|
322
|
-
|
323
|
-
logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
|
324
|
-
|
325
|
-
pairs = []
|
326
|
-
last_pair = nil
|
327
|
-
last_pair_data_count = 0
|
328
|
-
|
329
|
-
## loop over div blocks (might be .category or .category_data)
|
330
|
-
cell.children.each_with_index do |child,j|
|
331
|
-
unless child.element?
|
332
|
-
## puts " **** !!!! skipping non-element type >#{child.type}<:"
|
333
|
-
## puts child.to_s
|
334
|
-
next
|
335
|
-
end
|
336
|
-
unless child.name == 'div'
|
337
|
-
logger.warn " **** !!! skipping non-div >#{child.name}<:"
|
338
|
-
logger.warn child.to_s
|
339
|
-
next
|
340
|
-
end
|
341
|
-
|
342
|
-
### check if .category or .category_data
|
343
|
-
if child['class'] == 'category'
|
344
|
-
|
345
|
-
## collect text for category; exclude element w/ class.category_data
|
346
|
-
text = ""
|
347
|
-
child.children.each do |subchild|
|
348
|
-
text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
|
349
|
-
end
|
350
|
-
text = cleanup_key( text )
|
351
|
-
|
352
|
-
value = child.css('span.category_data').text.strip
|
353
|
-
|
354
|
-
logger.debug " -- category >>#{text}<<"
|
355
|
-
|
356
|
-
## start new pair
|
357
|
-
last_pair = [ text, value ]
|
358
|
-
last_pair_data_count = 0
|
359
|
-
pairs << last_pair
|
360
|
-
|
361
|
-
elsif child['class'] == 'category_data'
|
362
|
-
logger.debug " -- category_data"
|
363
|
-
|
364
|
-
text = child.text.strip
|
365
|
-
|
366
|
-
if last_pair.nil?
|
367
|
-
## assume its the very first entry; use implied/auto-created category
|
368
|
-
last_pair = [ 'text', '' ]
|
369
|
-
last_pair_data_count = 0
|
370
|
-
pairs << last_pair
|
371
|
-
end
|
372
|
-
|
373
|
-
### first category_data element?
|
374
|
-
if last_pair_data_count == 0
|
375
|
-
if last_pair[1] == ''
|
376
|
-
last_pair[1] = text
|
377
|
-
else
|
378
|
-
last_pair[1] += " #{text}" ## append w/o separator
|
379
|
-
end
|
380
|
-
else
|
381
|
-
if last_cat == 'demographic_profile' ## special case (use space a sep)
|
382
|
-
last_pair[1] += " #{text}" ## append with separator
|
383
|
-
else
|
384
|
-
last_pair[1] += "; #{text}" ## append with separator
|
385
|
-
end
|
386
|
-
end
|
387
|
-
last_pair_data_count += 1
|
388
|
-
|
389
|
-
else
|
390
|
-
logger.warn " **** !!! skipping div w/o category or category_data class:"
|
391
|
-
logger.warn child.to_s
|
392
|
-
end
|
393
|
-
end
|
394
|
-
|
395
|
-
## pp pairs
|
396
|
-
|
397
|
-
## pairs to hash
|
398
|
-
pairs_hash = {}
|
399
|
-
pairs.each do |pair|
|
400
|
-
pairs_hash[ pair[0] ] = pair[1]
|
401
|
-
end
|
402
|
-
|
403
|
-
hash[ last_cat ] = pairs_hash
|
404
|
-
|
405
|
-
else
|
406
|
-
logger.warn "#### !!!! unknown cell type (no field or data id found):"
|
407
|
-
logger.warn cell.to_s
|
408
|
-
end
|
409
|
-
end # each cell
|
410
|
-
|
411
|
-
hash # return hash
|
412
|
-
|
413
|
-
end # method sect_to_hash
|
414
|
-
|
415
272
|
end # class Page
|
416
273
|
|
417
274
|
end # module Factbook
|
@@ -0,0 +1,179 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
class Sect # section (e.g. Introduction/Geography/People/Economy/Energy/Transport/etc.)
|
6
|
+
include LogUtils::Logging
|
7
|
+
|
8
|
+
attr_reader :title, :html
|
9
|
+
|
10
|
+
def initialize( title, html )
|
11
|
+
## todo: passing a ref to the parent page - why? why not??
|
12
|
+
@title = title
|
13
|
+
@html = html
|
14
|
+
|
15
|
+
@doc = nil
|
16
|
+
@data = nil
|
17
|
+
end
|
18
|
+
|
19
|
+
def doc
|
20
|
+
### check: use nokogiri html fragment? why? why not??
|
21
|
+
@doc ||= Nokogiri::HTML( @html )
|
22
|
+
end
|
23
|
+
|
24
|
+
def data
|
25
|
+
@data ||= sect_to_hash( doc )
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def cleanup_key( key )
|
31
|
+
## to lower case
|
32
|
+
key = key.downcase
|
33
|
+
## seaport(s) => seaports
|
34
|
+
key = key.gsub( '(s)', 's' )
|
35
|
+
key = key.gsub( ':', '' ) # trailing :
|
36
|
+
## remove special chars ()-/,'
|
37
|
+
key = key.gsub( /['()\-\/,]/, ' ' )
|
38
|
+
key = key.strip
|
39
|
+
key = key.gsub( /[ ]+/, '_' )
|
40
|
+
key
|
41
|
+
end
|
42
|
+
|
43
|
+
|
44
|
+
def sect_to_hash( sect )
|
45
|
+
|
46
|
+
rows = sect.css( 'table tr' )
|
47
|
+
cells = sect.css( 'table tr td' )
|
48
|
+
field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
|
49
|
+
data_ids = rows.css( '#data' )
|
50
|
+
|
51
|
+
logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
|
52
|
+
|
53
|
+
hash = {}
|
54
|
+
last_cat = nil
|
55
|
+
|
56
|
+
cells.each_with_index do |cell,i|
|
57
|
+
## next if i > 14 ## skip after xx for debugging for now
|
58
|
+
|
59
|
+
# check if field or data id
|
60
|
+
# check for (nested) div#field in td
|
61
|
+
has_field_id = cell.css( '#field' ).size == 1 ? true : false
|
62
|
+
|
63
|
+
# check for td#data
|
64
|
+
has_data_id = cell['id'] == 'data' ? true : false
|
65
|
+
|
66
|
+
if has_field_id
|
67
|
+
|
68
|
+
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
69
|
+
if cats.size == 1
|
70
|
+
text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
|
71
|
+
last_cat = text
|
72
|
+
logger.debug " [#{i}] category: >>#{text}<<"
|
73
|
+
else
|
74
|
+
logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
|
75
|
+
logger.warn cell.to_s
|
76
|
+
end
|
77
|
+
|
78
|
+
elsif has_data_id
|
79
|
+
|
80
|
+
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
81
|
+
cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
|
82
|
+
cats_div_data = cell.css( 'div.category_data' )
|
83
|
+
cats_span_data = cell.css( 'span.category_data' )
|
84
|
+
|
85
|
+
logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
|
86
|
+
|
87
|
+
pairs = []
|
88
|
+
last_pair = nil
|
89
|
+
last_pair_data_count = 0
|
90
|
+
|
91
|
+
## loop over div blocks (might be .category or .category_data)
|
92
|
+
cell.children.each_with_index do |child,j|
|
93
|
+
unless child.element?
|
94
|
+
## puts " **** !!!! skipping non-element type >#{child.type}<:"
|
95
|
+
## puts child.to_s
|
96
|
+
next
|
97
|
+
end
|
98
|
+
unless child.name == 'div'
|
99
|
+
logger.warn " **** !!! skipping non-div >#{child.name}<:"
|
100
|
+
logger.warn child.to_s
|
101
|
+
next
|
102
|
+
end
|
103
|
+
|
104
|
+
### check if .category or .category_data
|
105
|
+
if child['class'] == 'category'
|
106
|
+
|
107
|
+
## collect text for category; exclude element w/ class.category_data
|
108
|
+
text = ""
|
109
|
+
child.children.each do |subchild|
|
110
|
+
text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
|
111
|
+
end
|
112
|
+
text = cleanup_key( text )
|
113
|
+
|
114
|
+
value = child.css('span.category_data').text.strip
|
115
|
+
|
116
|
+
logger.debug " -- category >>#{text}<<"
|
117
|
+
|
118
|
+
## start new pair
|
119
|
+
last_pair = [ text, value ]
|
120
|
+
last_pair_data_count = 0
|
121
|
+
pairs << last_pair
|
122
|
+
|
123
|
+
elsif child['class'] == 'category_data'
|
124
|
+
logger.debug " -- category_data"
|
125
|
+
|
126
|
+
text = child.text.strip
|
127
|
+
|
128
|
+
if last_pair.nil?
|
129
|
+
## assume it's the very first entry; use implied/auto-created category
|
130
|
+
last_pair = [ 'text', '' ]
|
131
|
+
last_pair_data_count = 0
|
132
|
+
pairs << last_pair
|
133
|
+
end
|
134
|
+
|
135
|
+
### first category_data element?
|
136
|
+
if last_pair_data_count == 0
|
137
|
+
if last_pair[1] == ''
|
138
|
+
last_pair[1] = text
|
139
|
+
else
|
140
|
+
last_pair[1] += " #{text}" ## append w/o separator
|
141
|
+
end
|
142
|
+
else
|
143
|
+
if last_cat == 'demographic_profile' ## special case (use space a sep)
|
144
|
+
last_pair[1] += " #{text}" ## append with separator
|
145
|
+
else
|
146
|
+
last_pair[1] += "; #{text}" ## append with separator
|
147
|
+
end
|
148
|
+
end
|
149
|
+
last_pair_data_count += 1
|
150
|
+
|
151
|
+
else
|
152
|
+
logger.warn " **** !!! skipping div w/o category or category_data class:"
|
153
|
+
logger.warn child.to_s
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
## pp pairs
|
158
|
+
|
159
|
+
## pairs to hash
|
160
|
+
pairs_hash = {}
|
161
|
+
pairs.each do |pair|
|
162
|
+
pairs_hash[ pair[0] ] = pair[1]
|
163
|
+
end
|
164
|
+
|
165
|
+
hash[ last_cat ] = pairs_hash
|
166
|
+
|
167
|
+
else
|
168
|
+
logger.warn "#### !!!! unknown cell type (no field or data id found):"
|
169
|
+
logger.warn cell.to_s
|
170
|
+
end
|
171
|
+
end # each cell
|
172
|
+
|
173
|
+
hash # return hash
|
174
|
+
|
175
|
+
end # method sect_to_hash
|
176
|
+
|
177
|
+
end # class Sect
|
178
|
+
|
179
|
+
end # module Factbook
|
data/lib/factbook/version.rb
CHANGED
data/lib/factbook.rb
CHANGED
@@ -7,6 +7,7 @@ require 'uri'
|
|
7
7
|
require 'cgi'
|
8
8
|
require 'pp'
|
9
9
|
require 'json'
|
10
|
+
require 'fileutils'
|
10
11
|
|
11
12
|
|
12
13
|
## 3rd party gems/libs
|
@@ -21,6 +22,7 @@ require 'nokogiri'
|
|
21
22
|
|
22
23
|
require 'factbook/version' # let it always go first
|
23
24
|
require 'factbook/page'
|
25
|
+
require 'factbook/sect'
|
24
26
|
|
25
27
|
|
26
28
|
module Factbook
|