factbook 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/factbook/page.rb CHANGED
@@ -2,8 +2,8 @@
2
2
 
3
3
  module Factbook
4
4
 
5
- class Page
6
5
 
6
+ class Page
7
7
  include LogUtils::Logging
8
8
 
9
9
  ## standard version
@@ -14,7 +14,15 @@ module Factbook
14
14
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
15
15
 
16
16
  def initialize( code )
17
- @code = code
17
+ ## note: requires factbook country code
18
+ # e.g. austria is au
19
+ # germany is gm and so on
20
+ @code = code
21
+
22
+ @html = nil
23
+ @doc = nil
24
+ @sects = nil
25
+ @data = nil
18
26
  end
19
27
 
20
28
  def doc
@@ -32,25 +40,13 @@ module Factbook
32
40
 
33
41
  def data
34
42
  if @data.nil?
35
- titles = [
36
- 'intro',
37
- 'geo',
38
- 'people',
39
- 'govt',
40
- 'econ',
41
- 'energy',
42
- 'comm',
43
- 'trans',
44
- 'military',
45
- 'issues' ]
46
-
47
43
  @data = {}
48
44
 
49
45
  sects.each_with_index do |sect,i|
50
46
  logger.debug "############################"
51
- logger.debug "### stats sect #{i}:"
47
+ logger.debug "### [#{i}] stats sect >#{sect.title}<: "
52
48
 
53
- @data[ titles[i] ] = sect_to_hash( sect )
49
+ @data[ sect.title ] = sect.data
54
50
  end
55
51
  end
56
52
  @data
@@ -58,51 +54,60 @@ module Factbook
58
54
 
59
55
 
60
56
  def sects
61
- ## split html into sections
62
- ## to avoid errors w/ nested tags
63
-
64
- divs = [
65
- '<div id="CollapsiblePanel1_Intro"',
66
- '<div id="CollapsiblePanel1_Geo"',
67
- '<div id="CollapsiblePanel1_People"',
68
- '<div id="CollapsiblePanel1_Govt"',
69
- '<div id="CollapsiblePanel1_Econ"',
70
- '<div id="CollapsiblePanel1_Energy"',
71
- '<div id="CollapsiblePanel1_Comm"',
72
- '<div id="CollapsiblePanel1_Trans"',
73
- '<div id="CollapsiblePanel1_Military"',
74
- '<div id="CollapsiblePanel1_Issues"' ]
75
-
76
57
  if @sects.nil?
77
- @sects = []
78
-
79
- @pos = []
80
- divs.each_with_index do |div,i|
58
+ ## split html into sections
59
+ ## lets us avoid errors w/ (wrongly) nested tags
60
+
61
+ divs = [
62
+ [ 'intro', '<div id="CollapsiblePanel1_Intro"' ],
63
+ [ 'geo', '<div id="CollapsiblePanel1_Geo"' ],
64
+ [ 'people', '<div id="CollapsiblePanel1_People"' ],
65
+ [ 'govt', '<div id="CollapsiblePanel1_Govt"' ],
66
+ [ 'econ', '<div id="CollapsiblePanel1_Econ"' ],
67
+ [ 'energy', '<div id="CollapsiblePanel1_Energy"' ],
68
+ [ 'comm', '<div id="CollapsiblePanel1_Comm"' ],
69
+ [ 'trans', '<div id="CollapsiblePanel1_Trans"' ],
70
+ [ 'military', '<div id="CollapsiblePanel1_Military"'],
71
+ [ 'issues', '<div id="CollapsiblePanel1_Issues"' ]
72
+ ]
73
+
74
+ indexes = []
75
+
76
+ ## note:
77
+ ## skip missing sections (w/ warning)
78
+ ## e.g. Vatican (Holy See), Liechtenstein etc. have no Energy section
79
+
80
+ divs.each_with_index do |rec,i|
81
+ title = rec[0]
82
+ div = rec[1]
81
83
  p = html.index( div )
82
84
  if p.nil?
83
- ## issue error: if not found
84
- puts "*** error: section not found -- #{div}"
85
+ ## issue warning: if not found
86
+ logger.warn "***!!! section not found -- #{div} --; skipping"
85
87
  else
86
- puts " found section #{i} @ #{p}"
88
+ logger.debug " found section #{i} @ #{p}"
89
+ indexes << [title,p]
87
90
  end
88
-
89
- @pos << p
90
91
  end
91
- @pos << -1 ## note: last entry add -1 for until the end of document
92
-
93
- divs.each_with_index do |div,i|
94
- from = @pos[i]
95
- to = @pos[i+1]
96
- to -= 1 unless to == -1 ## note: sub one (-1) unless end-of-string (-1)
92
+
93
+ @sects = []
94
+
95
+ indexes.each_with_index do |rec,i|
96
+ title = rec[0]
97
+ from = rec[1]
98
+
99
+ # is last entry? if yes use -1 otherwise pos
100
+ # note: subtract one (-1) from pos unless end-of-string (-1)
101
+ to = indexes[i+1].nil? ? -1 : indexes[i+1][1]-1
97
102
 
98
103
  ## todo: check that from is smaller than to
99
- puts " cut section #{i} [#{from}..#{to}]"
100
- @sects << Nokogiri::HTML( html[ from..to ] )
101
-
102
- if i==0 || i==1
103
- # puts "debug sect #{i}:"
104
- # puts ">>>|||#{html[ from..to ]}|||<<<"
105
- end
104
+ logger.debug " cut section #{i} [#{from}..#{to}]"
105
+ @sects << Sect.new( title, html[ from..to ] )
106
+
107
+ ##if i==0 || i==1
108
+ ## puts "debug sect #{i}:"
109
+ ## puts ">>>|||#{html[ from..to ]}|||<<<"
110
+ ##end
106
111
  end
107
112
  end
108
113
 
@@ -264,154 +269,6 @@ module Factbook
264
269
  end
265
270
  end
266
271
 
267
-
268
- def cleanup_key( key )
269
- ## to lower case
270
- key = key.downcase
271
- ## seaport(s) => seaports
272
- key = key.gsub( '(s)', 's' )
273
- key = key.gsub( ':', '' ) # trailing :
274
- ## remove special chars ()-/,'
275
- key = key.gsub( /['()\-\/,]/, ' ' )
276
- key = key.strip
277
- key = key.gsub( /[ ]+/, '_' )
278
- key
279
- end
280
-
281
-
282
- def sect_to_hash( sect )
283
-
284
- rows = sect.css( 'table tr' )
285
- cells = sect.css( 'table tr td' )
286
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
287
- data_ids = rows.css( '#data' )
288
-
289
- logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
290
-
291
- hash = {}
292
- last_cat = nil
293
-
294
- cells.each_with_index do |cell,i|
295
- ## next if i > 14 ## skip after xx for debugging for now
296
-
297
- # check if field or data id
298
- # check for (nested) div#field in td
299
- has_field_id = cell.css( '#field' ).size == 1 ? true : false
300
-
301
- # check for td#data
302
- has_data_id = cell['id'] == 'data' ? true : false
303
-
304
- if has_field_id
305
-
306
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
307
- if cats.size == 1
308
- text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
309
- last_cat = text
310
- logger.debug " [#{i}] category: >>#{text}<<"
311
- else
312
- logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
313
- logger.warn cell.to_s
314
- end
315
-
316
- elsif has_data_id
317
-
318
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
319
- cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
320
- cats_div_data = cell.css( 'div.category_data' )
321
- cats_span_data = cell.css( 'span.category_data' )
322
-
323
- logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
324
-
325
- pairs = []
326
- last_pair = nil
327
- last_pair_data_count = 0
328
-
329
- ## loop over div blocks (might be .category or .category_data)
330
- cell.children.each_with_index do |child,j|
331
- unless child.element?
332
- ## puts " **** !!!! skipping non-element type >#{child.type}<:"
333
- ## puts child.to_s
334
- next
335
- end
336
- unless child.name == 'div'
337
- logger.warn " **** !!! skipping non-div >#{child.name}<:"
338
- logger.warn child.to_s
339
- next
340
- end
341
-
342
- ### check if .category or .category_data
343
- if child['class'] == 'category'
344
-
345
- ## collect text for category; exclude element w/ class.category_data
346
- text = ""
347
- child.children.each do |subchild|
348
- text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
349
- end
350
- text = cleanup_key( text )
351
-
352
- value = child.css('span.category_data').text.strip
353
-
354
- logger.debug " -- category >>#{text}<<"
355
-
356
- ## start new pair
357
- last_pair = [ text, value ]
358
- last_pair_data_count = 0
359
- pairs << last_pair
360
-
361
- elsif child['class'] == 'category_data'
362
- logger.debug " -- category_data"
363
-
364
- text = child.text.strip
365
-
366
- if last_pair.nil?
367
- ## assume its the very first entry; use implied/auto-created category
368
- last_pair = [ 'text', '' ]
369
- last_pair_data_count = 0
370
- pairs << last_pair
371
- end
372
-
373
- ### first category_data element?
374
- if last_pair_data_count == 0
375
- if last_pair[1] == ''
376
- last_pair[1] = text
377
- else
378
- last_pair[1] += " #{text}" ## append w/o separator
379
- end
380
- else
381
- if last_cat == 'demographic_profile' ## special case (use space a sep)
382
- last_pair[1] += " #{text}" ## append with separator
383
- else
384
- last_pair[1] += "; #{text}" ## append with separator
385
- end
386
- end
387
- last_pair_data_count += 1
388
-
389
- else
390
- logger.warn " **** !!! skipping div w/o category or category_data class:"
391
- logger.warn child.to_s
392
- end
393
- end
394
-
395
- ## pp pairs
396
-
397
- ## pairs to hash
398
- pairs_hash = {}
399
- pairs.each do |pair|
400
- pairs_hash[ pair[0] ] = pair[1]
401
- end
402
-
403
- hash[ last_cat ] = pairs_hash
404
-
405
- else
406
- logger.warn "#### !!!! unknown cell type (no field or data id found):"
407
- logger.warn cell.to_s
408
- end
409
- end # each cell
410
-
411
- hash # return hash
412
-
413
- end # method sect_to_hash
414
-
415
272
  end # class Page
416
273
 
417
274
  end # module Factbook
@@ -0,0 +1,179 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ class Sect # section (e.g. Introduction/Geography/People/Economy/Energy/Transport/etc.)
6
+ include LogUtils::Logging
7
+
8
+ attr_reader :title, :html
9
+
10
+ def initialize( title, html )
11
+ ## todo: passing a ref to the parent page - why? why not??
12
+ @title = title
13
+ @html = html
14
+
15
+ @doc = nil
16
+ @data = nil
17
+ end
18
+
19
+ def doc
20
+ ### check: use nokogiri html fragment? why? why not??
21
+ @doc ||= Nokogiri::HTML( @html )
22
+ end
23
+
24
+ def data
25
+ @data ||= sect_to_hash( doc )
26
+ end
27
+
28
+ private
29
+
30
+ def cleanup_key( key )
31
+ ## to lower case
32
+ key = key.downcase
33
+ ## seaport(s) => seaports
34
+ key = key.gsub( '(s)', 's' )
35
+ key = key.gsub( ':', '' ) # trailing :
36
+ ## remove special chars ()-/,'
37
+ key = key.gsub( /['()\-\/,]/, ' ' )
38
+ key = key.strip
39
+ key = key.gsub( /[ ]+/, '_' )
40
+ key
41
+ end
42
+
43
+
44
+ def sect_to_hash( sect )
45
+
46
+ rows = sect.css( 'table tr' )
47
+ cells = sect.css( 'table tr td' )
48
+ field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
49
+ data_ids = rows.css( '#data' )
50
+
51
+ logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
52
+
53
+ hash = {}
54
+ last_cat = nil
55
+
56
+ cells.each_with_index do |cell,i|
57
+ ## next if i > 14 ## skip after xx for debugging for now
58
+
59
+ # check if field or data id
60
+ # check for (nested) div#field in td
61
+ has_field_id = cell.css( '#field' ).size == 1 ? true : false
62
+
63
+ # check for td#data
64
+ has_data_id = cell['id'] == 'data' ? true : false
65
+
66
+ if has_field_id
67
+
68
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
69
+ if cats.size == 1
70
+ text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
71
+ last_cat = text
72
+ logger.debug " [#{i}] category: >>#{text}<<"
73
+ else
74
+ logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
75
+ logger.warn cell.to_s
76
+ end
77
+
78
+ elsif has_data_id
79
+
80
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
81
+ cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
82
+ cats_div_data = cell.css( 'div.category_data' )
83
+ cats_span_data = cell.css( 'span.category_data' )
84
+
85
+ logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
86
+
87
+ pairs = []
88
+ last_pair = nil
89
+ last_pair_data_count = 0
90
+
91
+ ## loop over div blocks (might be .category or .category_data)
92
+ cell.children.each_with_index do |child,j|
93
+ unless child.element?
94
+ ## puts " **** !!!! skipping non-element type >#{child.type}<:"
95
+ ## puts child.to_s
96
+ next
97
+ end
98
+ unless child.name == 'div'
99
+ logger.warn " **** !!! skipping non-div >#{child.name}<:"
100
+ logger.warn child.to_s
101
+ next
102
+ end
103
+
104
+ ### check if .category or .category_data
105
+ if child['class'] == 'category'
106
+
107
+ ## collect text for category; exclude element w/ class.category_data
108
+ text = ""
109
+ child.children.each do |subchild|
110
+ text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
111
+ end
112
+ text = cleanup_key( text )
113
+
114
+ value = child.css('span.category_data').text.strip
115
+
116
+ logger.debug " -- category >>#{text}<<"
117
+
118
+ ## start new pair
119
+ last_pair = [ text, value ]
120
+ last_pair_data_count = 0
121
+ pairs << last_pair
122
+
123
+ elsif child['class'] == 'category_data'
124
+ logger.debug " -- category_data"
125
+
126
+ text = child.text.strip
127
+
128
+ if last_pair.nil?
129
+ ## assume its the very first entry; use implied/auto-created category
130
+ last_pair = [ 'text', '' ]
131
+ last_pair_data_count = 0
132
+ pairs << last_pair
133
+ end
134
+
135
+ ### first category_data element?
136
+ if last_pair_data_count == 0
137
+ if last_pair[1] == ''
138
+ last_pair[1] = text
139
+ else
140
+ last_pair[1] += " #{text}" ## append w/o separator
141
+ end
142
+ else
143
+ if last_cat == 'demographic_profile' ## special case (use space a sep)
144
+ last_pair[1] += " #{text}" ## append with separator
145
+ else
146
+ last_pair[1] += "; #{text}" ## append with separator
147
+ end
148
+ end
149
+ last_pair_data_count += 1
150
+
151
+ else
152
+ logger.warn " **** !!! skipping div w/o category or category_data class:"
153
+ logger.warn child.to_s
154
+ end
155
+ end
156
+
157
+ ## pp pairs
158
+
159
+ ## pairs to hash
160
+ pairs_hash = {}
161
+ pairs.each do |pair|
162
+ pairs_hash[ pair[0] ] = pair[1]
163
+ end
164
+
165
+ hash[ last_cat ] = pairs_hash
166
+
167
+ else
168
+ logger.warn "#### !!!! unknown cell type (no field or data id found):"
169
+ logger.warn cell.to_s
170
+ end
171
+ end # each cell
172
+
173
+ hash # return hash
174
+
175
+ end # method sect_to_hash
176
+
177
+ end # class Sect
178
+
179
+ end # module Factbook
@@ -1,5 +1,5 @@
1
1
 
2
2
  module Factbook
3
- VERSION = '0.1.1'
3
+ VERSION = '0.1.2'
4
4
  end
5
5
 
data/lib/factbook.rb CHANGED
@@ -7,6 +7,7 @@ require 'uri'
7
7
  require 'cgi'
8
8
  require 'pp'
9
9
  require 'json'
10
+ require 'fileutils'
10
11
 
11
12
 
12
13
  ## 3rd party gems/libs
@@ -21,6 +22,7 @@ require 'nokogiri'
21
22
 
22
23
  require 'factbook/version' # let it always go first
23
24
  require 'factbook/page'
25
+ require 'factbook/sect'
24
26
 
25
27
 
26
28
  module Factbook