factbook 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/factbook/page.rb CHANGED
@@ -2,8 +2,8 @@
2
2
 
3
3
  module Factbook
4
4
 
5
- class Page
6
5
 
6
+ class Page
7
7
  include LogUtils::Logging
8
8
 
9
9
  ## standard version
@@ -14,7 +14,15 @@ module Factbook
14
14
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
15
15
 
16
16
  def initialize( code )
17
- @code = code
17
+ ## note: requires factbook country code
18
+ # e.g. austria is au
19
+ # germany is gm and so on
20
+ @code = code
21
+
22
+ @html = nil
23
+ @doc = nil
24
+ @sects = nil
25
+ @data = nil
18
26
  end
19
27
 
20
28
  def doc
@@ -32,25 +40,13 @@ module Factbook
32
40
 
33
41
  def data
34
42
  if @data.nil?
35
- titles = [
36
- 'intro',
37
- 'geo',
38
- 'people',
39
- 'govt',
40
- 'econ',
41
- 'energy',
42
- 'comm',
43
- 'trans',
44
- 'military',
45
- 'issues' ]
46
-
47
43
  @data = {}
48
44
 
49
45
  sects.each_with_index do |sect,i|
50
46
  logger.debug "############################"
51
- logger.debug "### stats sect #{i}:"
47
+ logger.debug "### [#{i}] stats sect >#{sect.title}<: "
52
48
 
53
- @data[ titles[i] ] = sect_to_hash( sect )
49
+ @data[ sect.title ] = sect.data
54
50
  end
55
51
  end
56
52
  @data
@@ -58,51 +54,60 @@ module Factbook
58
54
 
59
55
 
60
56
  def sects
61
- ## split html into sections
62
- ## to avoid errors w/ nested tags
63
-
64
- divs = [
65
- '<div id="CollapsiblePanel1_Intro"',
66
- '<div id="CollapsiblePanel1_Geo"',
67
- '<div id="CollapsiblePanel1_People"',
68
- '<div id="CollapsiblePanel1_Govt"',
69
- '<div id="CollapsiblePanel1_Econ"',
70
- '<div id="CollapsiblePanel1_Energy"',
71
- '<div id="CollapsiblePanel1_Comm"',
72
- '<div id="CollapsiblePanel1_Trans"',
73
- '<div id="CollapsiblePanel1_Military"',
74
- '<div id="CollapsiblePanel1_Issues"' ]
75
-
76
57
  if @sects.nil?
77
- @sects = []
78
-
79
- @pos = []
80
- divs.each_with_index do |div,i|
58
+ ## split html into sections
59
+ ## lets us avoid errors w/ (wrongly) nested tags
60
+
61
+ divs = [
62
+ [ 'intro', '<div id="CollapsiblePanel1_Intro"' ],
63
+ [ 'geo', '<div id="CollapsiblePanel1_Geo"' ],
64
+ [ 'people', '<div id="CollapsiblePanel1_People"' ],
65
+ [ 'govt', '<div id="CollapsiblePanel1_Govt"' ],
66
+ [ 'econ', '<div id="CollapsiblePanel1_Econ"' ],
67
+ [ 'energy', '<div id="CollapsiblePanel1_Energy"' ],
68
+ [ 'comm', '<div id="CollapsiblePanel1_Comm"' ],
69
+ [ 'trans', '<div id="CollapsiblePanel1_Trans"' ],
70
+ [ 'military', '<div id="CollapsiblePanel1_Military"'],
71
+ [ 'issues', '<div id="CollapsiblePanel1_Issues"' ]
72
+ ]
73
+
74
+ indexes = []
75
+
76
+ ## note:
77
+ ## skip missing sections (w/ warning)
78
+ ## e.g. Vatican (Holy See), Liechtenstein etc. have no Energy section
79
+
80
+ divs.each_with_index do |rec,i|
81
+ title = rec[0]
82
+ div = rec[1]
81
83
  p = html.index( div )
82
84
  if p.nil?
83
- ## issue error: if not found
84
- puts "*** error: section not found -- #{div}"
85
+ ## issue warning: if not found
86
+ logger.warn "***!!! section not found -- #{div} --; skipping"
85
87
  else
86
- puts " found section #{i} @ #{p}"
88
+ logger.debug " found section #{i} @ #{p}"
89
+ indexes << [title,p]
87
90
  end
88
-
89
- @pos << p
90
91
  end
91
- @pos << -1 ## note: last entry add -1 for until the end of document
92
-
93
- divs.each_with_index do |div,i|
94
- from = @pos[i]
95
- to = @pos[i+1]
96
- to -= 1 unless to == -1 ## note: sub one (-1) unless end-of-string (-1)
92
+
93
+ @sects = []
94
+
95
+ indexes.each_with_index do |rec,i|
96
+ title = rec[0]
97
+ from = rec[1]
98
+
99
+ # is last entry? if yes use -1 otherwise pos
100
+ # note: subtract one (-1) from pos unless end-of-string (-1)
101
+ to = indexes[i+1].nil? ? -1 : indexes[i+1][1]-1
97
102
 
98
103
  ## todo: check that from is smaller than to
99
- puts " cut section #{i} [#{from}..#{to}]"
100
- @sects << Nokogiri::HTML( html[ from..to ] )
101
-
102
- if i==0 || i==1
103
- # puts "debug sect #{i}:"
104
- # puts ">>>|||#{html[ from..to ]}|||<<<"
105
- end
104
+ logger.debug " cut section #{i} [#{from}..#{to}]"
105
+ @sects << Sect.new( title, html[ from..to ] )
106
+
107
+ ##if i==0 || i==1
108
+ ## puts "debug sect #{i}:"
109
+ ## puts ">>>|||#{html[ from..to ]}|||<<<"
110
+ ##end
106
111
  end
107
112
  end
108
113
 
@@ -264,154 +269,6 @@ module Factbook
264
269
  end
265
270
  end
266
271
 
267
-
268
- def cleanup_key( key )
269
- ## to lower case
270
- key = key.downcase
271
- ## seaport(s) => seaports
272
- key = key.gsub( '(s)', 's' )
273
- key = key.gsub( ':', '' ) # trailing :
274
- ## remove special chars ()-/,'
275
- key = key.gsub( /['()\-\/,]/, ' ' )
276
- key = key.strip
277
- key = key.gsub( /[ ]+/, '_' )
278
- key
279
- end
280
-
281
-
282
- def sect_to_hash( sect )
283
-
284
- rows = sect.css( 'table tr' )
285
- cells = sect.css( 'table tr td' )
286
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
287
- data_ids = rows.css( '#data' )
288
-
289
- logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
290
-
291
- hash = {}
292
- last_cat = nil
293
-
294
- cells.each_with_index do |cell,i|
295
- ## next if i > 14 ## skip after xx for debugging for now
296
-
297
- # check if field or data id
298
- # check for (nested) div#field in td
299
- has_field_id = cell.css( '#field' ).size == 1 ? true : false
300
-
301
- # check for td#data
302
- has_data_id = cell['id'] == 'data' ? true : false
303
-
304
- if has_field_id
305
-
306
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
307
- if cats.size == 1
308
- text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
309
- last_cat = text
310
- logger.debug " [#{i}] category: >>#{text}<<"
311
- else
312
- logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
313
- logger.warn cell.to_s
314
- end
315
-
316
- elsif has_data_id
317
-
318
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
319
- cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
320
- cats_div_data = cell.css( 'div.category_data' )
321
- cats_span_data = cell.css( 'span.category_data' )
322
-
323
- logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
324
-
325
- pairs = []
326
- last_pair = nil
327
- last_pair_data_count = 0
328
-
329
- ## loop over div blocks (might be .category or .category_data)
330
- cell.children.each_with_index do |child,j|
331
- unless child.element?
332
- ## puts " **** !!!! skipping non-element type >#{child.type}<:"
333
- ## puts child.to_s
334
- next
335
- end
336
- unless child.name == 'div'
337
- logger.warn " **** !!! skipping non-div >#{child.name}<:"
338
- logger.warn child.to_s
339
- next
340
- end
341
-
342
- ### check if .category or .category_data
343
- if child['class'] == 'category'
344
-
345
- ## collect text for category; exclude element w/ class.category_data
346
- text = ""
347
- child.children.each do |subchild|
348
- text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
349
- end
350
- text = cleanup_key( text )
351
-
352
- value = child.css('span.category_data').text.strip
353
-
354
- logger.debug " -- category >>#{text}<<"
355
-
356
- ## start new pair
357
- last_pair = [ text, value ]
358
- last_pair_data_count = 0
359
- pairs << last_pair
360
-
361
- elsif child['class'] == 'category_data'
362
- logger.debug " -- category_data"
363
-
364
- text = child.text.strip
365
-
366
- if last_pair.nil?
367
- ## assume its the very first entry; use implied/auto-created category
368
- last_pair = [ 'text', '' ]
369
- last_pair_data_count = 0
370
- pairs << last_pair
371
- end
372
-
373
- ### first category_data element?
374
- if last_pair_data_count == 0
375
- if last_pair[1] == ''
376
- last_pair[1] = text
377
- else
378
- last_pair[1] += " #{text}" ## append w/o separator
379
- end
380
- else
381
- if last_cat == 'demographic_profile' ## special case (use space a sep)
382
- last_pair[1] += " #{text}" ## append with separator
383
- else
384
- last_pair[1] += "; #{text}" ## append with separator
385
- end
386
- end
387
- last_pair_data_count += 1
388
-
389
- else
390
- logger.warn " **** !!! skipping div w/o category or category_data class:"
391
- logger.warn child.to_s
392
- end
393
- end
394
-
395
- ## pp pairs
396
-
397
- ## pairs to hash
398
- pairs_hash = {}
399
- pairs.each do |pair|
400
- pairs_hash[ pair[0] ] = pair[1]
401
- end
402
-
403
- hash[ last_cat ] = pairs_hash
404
-
405
- else
406
- logger.warn "#### !!!! unknown cell type (no field or data id found):"
407
- logger.warn cell.to_s
408
- end
409
- end # each cell
410
-
411
- hash # return hash
412
-
413
- end # method sect_to_hash
414
-
415
272
  end # class Page
416
273
 
417
274
  end # module Factbook
@@ -0,0 +1,179 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ class Sect # section (e.g. Introduction/Geography/People/Economy/Energy/Transport/etc.)
6
+ include LogUtils::Logging
7
+
8
+ attr_reader :title, :html
9
+
10
+ def initialize( title, html )
11
+ ## todo: passing a ref to the parent page - why? why not??
12
+ @title = title
13
+ @html = html
14
+
15
+ @doc = nil
16
+ @data = nil
17
+ end
18
+
19
+ def doc
20
+ ### check: use nokogiri html fragment? why? why not??
21
+ @doc ||= Nokogiri::HTML( @html )
22
+ end
23
+
24
+ def data
25
+ @data ||= sect_to_hash( doc )
26
+ end
27
+
28
+ private
29
+
30
+ def cleanup_key( key )
31
+ ## to lower case
32
+ key = key.downcase
33
+ ## seaport(s) => seaports
34
+ key = key.gsub( '(s)', 's' )
35
+ key = key.gsub( ':', '' ) # trailing :
36
+ ## remove special chars ()-/,'
37
+ key = key.gsub( /['()\-\/,]/, ' ' )
38
+ key = key.strip
39
+ key = key.gsub( /[ ]+/, '_' )
40
+ key
41
+ end
42
+
43
+
44
+ def sect_to_hash( sect )
45
+
46
+ rows = sect.css( 'table tr' )
47
+ cells = sect.css( 'table tr td' )
48
+ field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
49
+ data_ids = rows.css( '#data' )
50
+
51
+ logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
52
+
53
+ hash = {}
54
+ last_cat = nil
55
+
56
+ cells.each_with_index do |cell,i|
57
+ ## next if i > 14 ## skip after xx for debugging for now
58
+
59
+ # check if field or data id
60
+ # check for (nested) div#field in td
61
+ has_field_id = cell.css( '#field' ).size == 1 ? true : false
62
+
63
+ # check for td#data
64
+ has_data_id = cell['id'] == 'data' ? true : false
65
+
66
+ if has_field_id
67
+
68
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
69
+ if cats.size == 1
70
+ text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
71
+ last_cat = text
72
+ logger.debug " [#{i}] category: >>#{text}<<"
73
+ else
74
+ logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
75
+ logger.warn cell.to_s
76
+ end
77
+
78
+ elsif has_data_id
79
+
80
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
81
+ cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
82
+ cats_div_data = cell.css( 'div.category_data' )
83
+ cats_span_data = cell.css( 'span.category_data' )
84
+
85
+ logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
86
+
87
+ pairs = []
88
+ last_pair = nil
89
+ last_pair_data_count = 0
90
+
91
+ ## loop over div blocks (might be .category or .category_data)
92
+ cell.children.each_with_index do |child,j|
93
+ unless child.element?
94
+ ## puts " **** !!!! skipping non-element type >#{child.type}<:"
95
+ ## puts child.to_s
96
+ next
97
+ end
98
+ unless child.name == 'div'
99
+ logger.warn " **** !!! skipping non-div >#{child.name}<:"
100
+ logger.warn child.to_s
101
+ next
102
+ end
103
+
104
+ ### check if .category or .category_data
105
+ if child['class'] == 'category'
106
+
107
+ ## collect text for category; exclude element w/ class.category_data
108
+ text = ""
109
+ child.children.each do |subchild|
110
+ text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
111
+ end
112
+ text = cleanup_key( text )
113
+
114
+ value = child.css('span.category_data').text.strip
115
+
116
+ logger.debug " -- category >>#{text}<<"
117
+
118
+ ## start new pair
119
+ last_pair = [ text, value ]
120
+ last_pair_data_count = 0
121
+ pairs << last_pair
122
+
123
+ elsif child['class'] == 'category_data'
124
+ logger.debug " -- category_data"
125
+
126
+ text = child.text.strip
127
+
128
+ if last_pair.nil?
129
+ ## assume it's the very first entry; use implied/auto-created category
130
+ last_pair = [ 'text', '' ]
131
+ last_pair_data_count = 0
132
+ pairs << last_pair
133
+ end
134
+
135
+ ### first category_data element?
136
+ if last_pair_data_count == 0
137
+ if last_pair[1] == ''
138
+ last_pair[1] = text
139
+ else
140
+ last_pair[1] += " #{text}" ## append w/o separator
141
+ end
142
+ else
143
+ if last_cat == 'demographic_profile' ## special case (use space a sep)
144
+ last_pair[1] += " #{text}" ## append with separator
145
+ else
146
+ last_pair[1] += "; #{text}" ## append with separator
147
+ end
148
+ end
149
+ last_pair_data_count += 1
150
+
151
+ else
152
+ logger.warn " **** !!! skipping div w/o category or category_data class:"
153
+ logger.warn child.to_s
154
+ end
155
+ end
156
+
157
+ ## pp pairs
158
+
159
+ ## pairs to hash
160
+ pairs_hash = {}
161
+ pairs.each do |pair|
162
+ pairs_hash[ pair[0] ] = pair[1]
163
+ end
164
+
165
+ hash[ last_cat ] = pairs_hash
166
+
167
+ else
168
+ logger.warn "#### !!!! unknown cell type (no field or data id found):"
169
+ logger.warn cell.to_s
170
+ end
171
+ end # each cell
172
+
173
+ hash # return hash
174
+
175
+ end # method sect_to_hash
176
+
177
+ end # class Sect
178
+
179
+ end # module Factbook
@@ -1,5 +1,5 @@
1
1
 
2
2
  module Factbook
3
- VERSION = '0.1.1'
3
+ VERSION = '0.1.2'
4
4
  end
5
5
 
data/lib/factbook.rb CHANGED
@@ -7,6 +7,7 @@ require 'uri'
7
7
  require 'cgi'
8
8
  require 'pp'
9
9
  require 'json'
10
+ require 'fileutils'
10
11
 
11
12
 
12
13
  ## 3rd party gems/libs
@@ -21,6 +22,7 @@ require 'nokogiri'
21
22
 
22
23
  require 'factbook/version' # let it always go first
23
24
  require 'factbook/page'
25
+ require 'factbook/sect'
24
26
 
25
27
 
26
28
  module Factbook