factbook-readers 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Manifest.txt +56 -0
  4. data/README.md +196 -0
  5. data/Rakefile +34 -0
  6. data/data/attributes.yml +337 -0
  7. data/data/categories.csv +164 -0
  8. data/data/codes.csv +262 -0
  9. data/data/codesxref.csv +280 -0
  10. data/data/comparisons.csv +75 -0
  11. data/lib/factbook-readers.rb +59 -0
  12. data/lib/factbook-readers/attributes.rb +74 -0
  13. data/lib/factbook-readers/builder.rb +212 -0
  14. data/lib/factbook-readers/builder_item.rb +185 -0
  15. data/lib/factbook-readers/builder_json.rb +79 -0
  16. data/lib/factbook-readers/codes.rb +122 -0
  17. data/lib/factbook-readers/comparisons.rb +50 -0
  18. data/lib/factbook-readers/counter.rb +48 -0
  19. data/lib/factbook-readers/normalize.rb +43 -0
  20. data/lib/factbook-readers/page.rb +148 -0
  21. data/lib/factbook-readers/page_info.rb +12 -0
  22. data/lib/factbook-readers/reader_json.rb +51 -0
  23. data/lib/factbook-readers/sanitizer.rb +307 -0
  24. data/lib/factbook-readers/sect.rb +29 -0
  25. data/lib/factbook-readers/subsect.rb +18 -0
  26. data/lib/factbook-readers/table.rb +52 -0
  27. data/lib/factbook-readers/utils.rb +47 -0
  28. data/lib/factbook-readers/utils_info.rb +129 -0
  29. data/lib/factbook-readers/version.rb +24 -0
  30. data/lib/factbook/readers.rb +5 -0
  31. data/test/data/au.html +579 -0
  32. data/test/data/au.yml +8 -0
  33. data/test/data/be.html +596 -0
  34. data/test/data/be.yml +8 -0
  35. data/test/data/json/au.json +892 -0
  36. data/test/data/src/ag.html +716 -0
  37. data/test/data/src/au-2015-09-24.html +2006 -0
  38. data/test/data/src/au.html +658 -0
  39. data/test/data/src/be-2015-09-24.html +2011 -0
  40. data/test/data/src/be.html +648 -0
  41. data/test/helper.rb +11 -0
  42. data/test/test_attribs.rb +87 -0
  43. data/test/test_attribs_def.rb +20 -0
  44. data/test/test_builder.rb +35 -0
  45. data/test/test_codes.rb +76 -0
  46. data/test/test_comparisons.rb +19 -0
  47. data/test/test_convert.rb +30 -0
  48. data/test/test_counter.rb +31 -0
  49. data/test/test_fields.rb +52 -0
  50. data/test/test_importer.rb +56 -0
  51. data/test/test_item_builder.rb +99 -0
  52. data/test/test_json.rb +45 -0
  53. data/test/test_json_builder.rb +25 -0
  54. data/test/test_normalize.rb +23 -0
  55. data/test/test_page.rb +38 -0
  56. data/test/test_sanitizer.rb +39 -0
  57. data/test/test_sanitizer_regex.rb +89 -0
  58. metadata +196 -0
@@ -0,0 +1,75 @@
1
+ Num,Category,Name
2
+ 2147,Geography,Area
3
+ 2119,People and Society,Population
4
+ 2002,People and Society,Population growth rate
5
+ 2054,People and Society,Birth rate
6
+ 2066,People and Society,Death rate
7
+ 2112,People and Society,Net migration rate
8
+ 2223,People and Society,Maternal mortality rate
9
+ 2091,People and Society,Infant mortality rate
10
+ 2102,People and Society,Life expectancy at birth
11
+ 2127,People and Society,Total fertility rate
12
+ 2225,People and Society,Health expenditures
13
+ 2155,People and Society,HIV/AIDS - adult prevalence rate
14
+ 2156,People and Society,HIV/AIDS - people living with HIV/AIDS
15
+ 2157,People and Society,HIV/AIDS - deaths
16
+ 2228,People and Society,Obesity - adult prevalence rate
17
+ 2224,People and Society,Children under the age of 5 years underweight
18
+ 2206,People and Society,Education expenditures
19
+ 2229,People and Society,"Unemployment, youth ages 15-24"
20
+ 2001,Economy,GDP (purchasing power parity)
21
+ 2003,Economy,GDP - real growth rate
22
+ 2004,Economy,GDP - per capita (PPP)
23
+ 2260,Economy,Gross national saving
24
+ 2089,Economy,Industrial production growth rate
25
+ 2095,Economy,Labor force
26
+ 2129,Economy,Unemployment rate
27
+ 2172,Economy,Distribution of family income - Gini index
28
+ 2221,Economy,Taxes and other revenues
29
+ 2222,Economy,Budget surplus (+) or deficit (-)
30
+ 2186,Economy,Public debt
31
+ 2092,Economy,Inflation rate (consumer prices)
32
+ 2207,Economy,Central bank discount rate
33
+ 2208,Economy,Commercial bank prime lending rate
34
+ 2214,Economy,Stock of narrow money
35
+ 2215,Economy,Stock of broad money
36
+ 2211,Economy,Stock of domestic credit
37
+ 2200,Economy,Market value of publicly traded shares
38
+ 2187,Economy,Current account balance
39
+ 2078,Economy,Exports
40
+ 2087,Economy,Imports
41
+ 2188,Economy,Reserves of foreign exchange and gold
42
+ 2079,Economy,Debt - external
43
+ 2198,Economy,Stock of direct foreign investment - at home
44
+ 2199,Economy,Stock of direct foreign investment - abroad
45
+ 2232,Energy,Electricity - production
46
+ 2233,Energy,Electricity - consumption
47
+ 2234,Energy,Electricity - exports
48
+ 2235,Energy,Electricity - imports
49
+ 2236,Energy,Electricity - installed generating capacity
50
+ 2237,Energy,Electricity - from fossil fuels
51
+ 2239,Energy,Electricity - from nuclear fuels
52
+ 2238,Energy,Electricity - from hydroelectric plants
53
+ 2240,Energy,Electricity - from other renewable sources
54
+ 2241,Energy,Crude oil - production
55
+ 2242,Energy,Crude oil - exports
56
+ 2243,Energy,Crude oil - imports
57
+ 2244,Energy,Crude oil - proved reserves
58
+ 2245,Energy,Refined petroleum products - production
59
+ 2246,Energy,Refined petroleum products - consumption
60
+ 2247,Energy,Refined petroleum products - exports
61
+ 2248,Energy,Refined petroleum products - imports
62
+ 2249,Energy,Natural gas - production
63
+ 2250,Energy,Natural gas - consumption
64
+ 2251,Energy,Natural gas - exports
65
+ 2252,Energy,Natural gas - imports
66
+ 2253,Energy,Natural gas - proved reserves
67
+ 2150,Communications,Telephones - fixed lines
68
+ 2151,Communications,Telephones - mobile cellular
69
+ 2153,Communications,Internet users
70
+ 2053,Transportation,Airports
71
+ 2121,Transportation,Railways
72
+ 2085,Transportation,Roadways
73
+ 2093,Transportation,Waterways
74
+ 2108,Transportation,Merchant marine
75
+ 2034,Military,Military expenditures
@@ -0,0 +1,59 @@
1
+ ## 3rd party gems/libs
2
+ ## require 'props'
3
+
4
+ require 'logutils'
5
+ require 'webget'
6
+ require 'csvreader'
7
+
8
+
9
+ require 'nokogiri'
10
+
11
+
12
+
13
+
14
+ # our own code
15
+ require 'factbook-readers/version' # let it always go first
16
+
17
+
18
+ require 'factbook-readers/codes'
19
+ require 'factbook-readers/comparisons'
20
+ require 'factbook-readers/attributes'
21
+
22
module Factbook

  ## Built-in reference data; every constant is populated when this file is
  ## loaded, reading from the data/ folder below the readers root.
  CODES       = Codes.from_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" )
  COMPARISONS = Comparisons.from_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
  ATTRIBUTES  = Attributes.from_yaml( "#{Factbook::Module::Readers.root}/data/attributes.yml" )

  ## Module-level readers for the built-in datasets,
  ##   e.g. Factbook.codes, Factbook.comparisons, Factbook.attributes
  class << self
    def codes
      CODES
    end

    def comparisons
      COMPARISONS
    end

    def attributes
      ATTRIBUTES
    end
  end

end  # module Factbook
34
+
35
+ ## note: make codes, comparisons, attributes available
36
+
37
+ require 'factbook-readers/utils'
38
+ require 'factbook-readers/utils_info'
39
+ require 'factbook-readers/sanitizer'
40
+ require 'factbook-readers/normalize'
41
+ require 'factbook-readers/builder_item'
42
+ require 'factbook-readers/builder'
43
+ require 'factbook-readers/builder_json'
44
+ require 'factbook-readers/page'
45
+ require 'factbook-readers/page_info'
46
+ require 'factbook-readers/sect'
47
+ require 'factbook-readers/subsect'
48
+
49
+
50
+ require 'factbook-readers/reader_json'
51
+
52
+ require 'factbook-readers/table' ## e.g. TableReader
53
+
54
+ require 'factbook-readers/counter'
55
+
56
+
57
+
58
+
59
+ puts Factbook::Module::Readers.banner
@@ -0,0 +1,74 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module Factbook
5
+
6
class Attributes
  include Enumerable   ## built on top of #each; adds map, select, find, etc.

  ## A single attribute definition:
  ##   name     - attribute name (value of the 'name' key in the definition hash)
  ##   category - top-level key the definition was found under
  ##              e.g. Introduction, Geography, etc.
  ##   path     - array of (nested) keys leading to the definition
  ##              e.g. ["Area - comparative"] or ["Area", "land"] etc.
  Attribute = Struct.new( :name,
                          :category,
                          :path )

  ## Parent keys whose 'name' entry is NOT an attribute name but a regular
  ##  (nested) sub-field that has to be walked like any other key.
  NAME_AS_SUBFIELD = ['Capital', 'National anthem'].freeze


  ## Read the attribute definitions from the YAML file at path.
  ##  Every top-level key is used as the category for all definitions
  ##  nested below it.
  ##
  ## Returns a new Attributes collection.
  def self.from_yaml( path )
    h = YAML.load_file( path )
    pp h

    attribs = []
    ## note: build_attribs never modifies the hash passed in
    ##         - no (deep) copy required
    h.each do |category, definitions|
      build_attribs( attribs, category, [], definitions )
    end

    new( attribs )
  end


  ## Walk the (nested) definition hash h recursively and append an
  ##  Attribute to attribs for every hash holding a 'name' key.
  ##
  ##   attribs  - array collecting the attributes found (appended in place)
  ##   category - category recorded for every attribute found
  ##   path     - array of keys walked so far
  ##   h        - definition hash to inspect; left untouched
  def self.build_attribs( attribs, category, path, h )
    ## assume it's an attribute definition hash if it has a 'name' key
    ##   note: !! exclude the special cases (see NAME_AS_SUBFIELD)
    ##            where name is a nested sub-field itself
    is_definition = h.has_key?( 'name' ) && !NAME_AS_SUBFIELD.include?( path[-1] )

    if is_definition
      a = Attribute.new( h['name'], category, path )

      puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
      attribs << a
    end

    ## continue walking (recursive)
    h.each do |k,v|
      ## the 'name' entry of a definition got consumed above; skip it here
      ##   (instead of deleting it from the caller's hash)
      next if is_definition && k == 'name'

      build_attribs( attribs, category, path + [k], v )   ## note: path + [k] creates a new array (copy)
    end

    nil
  end


  def initialize( attribs )
    @attribs = attribs
  end

  def to_a()  @attribs; end
  def size()  @attribs.size; end

  ## Yield every attribute (in definition order);
  ##   returns an enumerator if no block is given.
  def each
    return to_enum( :each ) { size } unless block_given?

    @attribs.each { |attrib| yield( attrib ) }
  end

end # class Attributes
72
+
73
+ end # module Factbook
74
+
@@ -0,0 +1,212 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
class Builder    ## todo: change to PageBuilder ???
  include LogUtils::Logging


  ## note: disabled (commented out) for now; kept for reference
  ##
  ## def self.from_cc( cc, opts={} )   ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
  ##   ## check/todo: rename input_dir to just dir or to include ?
  ##   ##    (there's no output_dir)?? - why? why not?
  ##   input_dir = opts[:input_dir] || '.'
  ##   self.from_file( "#{input_dir}/#{cc}.html" )
  ## end


  ## Read the page from the file at path and build it.
  def self.from_file( path )
    from_string( File.read( path ) )    ## fix/todo: use ASCII8BIT/binary reader !!!!!
  end

  ## Build the page from a string; note: expects ASCII-7BIT/BINARY encoding
  def self.from_string( html_ascii )
    new( html_ascii )
  end


  attr_reader :html_ascii,   ## full "original" 1:1 page in "original/ascii8/binary" encoding
              :html,         ## utf-8 encoded profile
              :html_debug,   ## html w/ mapping markers - rename to html_markers - why? why not?
              :info,         ## page info incl. country_name, region_name, last_updated etc.
              :errors,       ## encoding errors etc.
              :sects


  ## Sanitize the page, mark up and split the section / subsection
  ##  headings and build a Sect (holding Subsects) for every section found.
  def initialize( html_ascii )
    @html_ascii = html_ascii

    ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
    @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )

    ## note: support "empty" pages - old format waiting for update!!!
    ##   cannot parse for now; use empty (no) sections
    ##   (html_debug stays unset in this case)
    html_sects = []
    unless @html.empty?
      @html_debug = map_subsects( map_sects( @html ) )
      html_sects  = split_sects( @html_debug )
    end

    pp html_sects

    ## debug
    ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }

    @sects = []
    html_sects.each do |(sect_head, sect_chunks)|
      puts sect_head
      puts sect_chunks.size

      ## get section title
      ##  @SECTION{Economy}  => Economy
      m = /@SECTION{(.+?)}/.match( sect_head )
      next unless m     ## warn/fix: no section title found

      sect_title = m[1].strip
      puts sect_title

      sect          = Sect.new
      sect.title    = sect_title
      sect.subsects = build_subsects( sect_chunks )
      @sects << sect
    end
  end



  ## Convert section titles to the "unified" marker, e.g.
  ##    <h2>Introduction</h2>   =>  @SECTION{Introduction}
  def map_sects( html )
    title_regex= /<h2>
                    \s*
                     (.+?) ## note: use non-greedy; do NOT allow tags inside for now
                    \s*
                  <\/h2>
                 /xim

    html.gsub( title_regex ) do |m|
      title = Regexp.last_match( 1 )
      puts "** found section >#{title}<:"
      puts " >|#{m}|<"

      "\n\n@SECTION{#{title}}\n\n"
    end
  end


  ## Convert subsection titles to the "unified" marker, e.g.
  ##    <h3>Disputes - international:</h3>   =>  @SUBSECTION{Disputes - international:}
  def map_subsects( html )
    title_regex= /<h3>
                   \s*
                     (.+?) ## note: use non-greedy; allows tags inside - why? why not
                   \s*
                  <\/h3>
                 /xim

    html.gsub( title_regex ) do |m|
      title = Regexp.last_match( 1 )
      puts "** found subsection >#{title}<:"
      puts " >|#{m}|<"

      "\n@SUBSECTION{#{title}}\n"
    end
  end



  ## Split html into sections (divided by the section markers);
  ##  anything before the first marker (prolog) gets dropped.
  ##
  ## Returns an array of pairs
  ##   [[heading, subsects],
  ##    [heading, subsects],...]
  ##  with subsects being the section body split by split_subsects.
  def split_sects( html )
    ## note: the regex is wrapped in (one) capture group;
    ##   String#split includes all capture groups in the result array
    chunks = html.split( /(@SECTION{.+?})/ )    ## note: use non-greedy -- check: need to escape {} ??

    ## first chunk is the (html) prolog unless it is a section marker itself
    chunks.shift   unless chunks.first =~ /@SECTION/

    ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
    chunks.each_slice( 2 ).map do |head, body|
      [head, split_subsects( body )]
    end
  end


  ## Split html into subsections (divided by the subsection markers);
  ##  anything before the first marker (prolog) gets dropped.
  ##
  ## Returns an array of pairs
  ##   [[heading, body],
  ##    [heading, body],...]
  def split_subsects( html )
    ## note: the regex is wrapped in (one) capture group;
    ##   String#split includes all capture groups in the result array
    chunks = html.split( /(@SUBSECTION{.+?})/ )   ## note: use non-greedy -- check: need to escape {} ??

    ## first chunk is the (html) prolog unless it is a subsection marker itself
    chunks.shift   unless chunks.first =~ /@SUBSECTION/

    chunks.each_slice( 2 ).to_a
  end


  private

  ## Build a Subsect for every [heading, body] pair that carries
  ##  a subsection marker; pairs without a marker get skipped.
  def build_subsects( html_subsects )
    subsects = []
    html_subsects.each do |(subsect_head, subsect_body)|
      m = /@SUBSECTION{(.+?)}/.match( subsect_head )
      next unless m     ## warn/fix: no subsection title found

      ## remove trailing : if present
      subsect_title = m[1].strip.sub( /:\z/, '' ).strip
      puts subsect_title

      subsect       = Subsect.new
      subsect.title = subsect_title
      subsect.data  = Factbook::ItemBuilder.new( subsect_body, subsect_title ).read

      subsects << subsect
    end
    subsects
  end

end # class Builder
210
+
211
+
212
+ end # module Factbook
@@ -0,0 +1,185 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
class ItemBuilder   ## rename to ItemReader, ItemParser - why? why not??
  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category

  def initialize( html, name )
    @html = html
    @name = name     # add category/field name e.g. Area, Location, etc.
  end



  ##
  ## sample html snippet:
  ##
  ## <div class="category_data subfield text">
  ##   Portuguese (official and most widely spoken language)
  ##
  ## </div>
  ## <div class="category_data note">
  ##   <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
  ## </div>


  ## Parse the html snippet and return the data found as a hash.
  ##  Unexpected markup gets reported on stdout and ends the program (exit 1).
  def read
    doc  = Nokogiri::HTML.fragment( @html )
    data = {}

    ## note: skip whitespace text nodes (e.g. \n\n etc); just use divs
    divs = doc.children.filter('div')

    puts " parsing >#{@name}< - #{divs.size} category_data divs(s):"

    ## handle special case for multiple 'grouped_subfield' first
    ##   e.g. used in
    ##    - Drinking water source:
    ##    - Sanitation facility access:
    grouped, others = divs.partition { |div| div['class'].index( 'grouped_subfield' ) }

    ## note: only use special rule if more than one div marked grouped_
    if grouped.size > 1
      read_grouped_divs( data, grouped )
      divs = others    ## continue processing the rest as usual
    end

    divs.each_with_index do |div,i|
      css_class = div['class']

      if css_class.index( 'note' )
        read_note_div( data, div )
      elsif css_class.index( 'historic' )
        read_historic_div( data, div, i )
      elsif div.css( 'span.subfield-name').empty?
        read_implied_text_div( data, div, i )
      elsif css_class.index( 'grouped_subfield' )
        read_joined_subfields_div( data, div )
      else
        read_subfield_div( data, div )
      end
    end

    pp data
    data
  end



  def squish( str )
    str.gsub( /[ \t\n\r]{2,}/, ' ')   ## replace runs of two or more spaces / tabs / newlines with one space
  end


  private

  ## Stripped and squished text of the div.
  def div_text( div )
    squish( div.text.strip )
  end

  ## Print all lines and end the program with exit status 1.
  def report_and_exit( *lines )
    lines.each { |line| puts line }
    exit 1
  end

  ## Multiple grouped divs: a div with a subfield-group span starts
  ##  a new group (keyed by the span text); all other divs get
  ##  appended to the text of the last group started.
  def read_grouped_divs( data, divs )
    key = nil
    divs.each do |div|
      span_group = div.at( 'span.subfield-group')
      if span_group
        # start a new group
        key = normalize_category( span_group.text.strip )
        span_group.replace( '' )

        text = div_text( div )
        puts "new group - category_data key >#{key}<: >#{text}<"
        data[ key ] = { 'text' => text }
      else
        ## append to (last) group
        text = div_text( div )
        puts "add group - category_data key >#{key}<: >#{text}<"
        data[ key ]['text'] += " / #{text}"
      end
    end
  end

  ## note: for now only allow one note per subsection/field data block
  def read_note_div( data, div )
    text = div_text( div )
    puts "category_data: >#{text}<"

    if data['note']
      report_and_exit( "!! ERROR: note already taken:", data['note'], div.to_html )
    end

    data['note'] = { 'text' => text }
  end

  ## add all historic together into one (text) for now
  def read_historic_div( data, div, i )
    text = div_text( div )
    puts "category_data: >#{text}<"

    if data['text']
      ## append with / for now
      data['text'] += " / #{text}"
    else
      data['text'] = text
      ## check if historic is first node
      if i != 0
        report_and_exit( "!! ERROR: expected first historic node to be first node but it is #{i+1}:",
                         div.to_html )
      end
    end
  end

  ## div without any subfield-name span - assume "implied text field"
  ##   check for index == 1 / child count == 1 - why? why not
  def read_implied_text_div( data, div, i )
    text = div_text( div )
    puts "category_data: >#{text}<"

    data['text'] = text

    ## must be always first node for now
    if i != 0
      report_and_exit( "!! ERROR - 'implied' category W/O name NOT first div / node:",
                       div.to_html )
    end
  end

  ## split grouped subfield!!
  ##   <span class="subfield-name">arable land:</span>
  ##   <span class="subfield-number">8.6%</span>
  ##   <span class="subfield-date">(2011 est.)</span>
  ##   /
  ##   <span class="subfield-name">permanent crops:</span>
  ##   <span class="subfield-number">0.8%</span>
  ##   <span class="subfield-date">(2011 est.)</span>
  ##   /
  ##   <span class="subfield-name">permanent pasture:</span>
  ##   <span class="subfield-number">23.5%</span>
  ##   <span class="subfield-date">(2011 est.)</span>
  ##
  ## join names for now - why? why not?
  ##   e.g. becomes:
  ##      arable land / permanent crops / permanent pasture: for key ??
  def read_joined_subfields_div( data, div )
    keys = div.css( 'span.subfield-name').map do |span|
      name = normalize_category( span.text.strip )
      span.replace( '' )
      name
    end
    key  = keys.join( ' / ')
    text = div_text( div )
    puts "category_data key >#{key}<: >#{text}<"
    data[ key ] = { 'text' => text }
  end

  ## regular subfield - exactly one subfield-name span expected
  def read_subfield_div( data, div )
    span_names = div.css( 'span.subfield-name')
    if span_names.size > 1
      report_and_exit( "!! ERROR - found more than one subfield-name:", div.to_html )
    end

    span_name = span_names[0]
    key = normalize_category( span_name.text.strip )
    span_name.replace( '' )

    text = div_text( div )
    puts "category_data key >#{key}<: >#{text}<"
    data[ key ] = { 'text' => text }
  end

end # class ItemBuilder
184
+
185
+ end # module Factbook