factbook-readers 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Manifest.txt +56 -0
  4. data/README.md +196 -0
  5. data/Rakefile +34 -0
  6. data/data/attributes.yml +337 -0
  7. data/data/categories.csv +164 -0
  8. data/data/codes.csv +262 -0
  9. data/data/codesxref.csv +280 -0
  10. data/data/comparisons.csv +75 -0
  11. data/lib/factbook-readers.rb +59 -0
  12. data/lib/factbook-readers/attributes.rb +74 -0
  13. data/lib/factbook-readers/builder.rb +212 -0
  14. data/lib/factbook-readers/builder_item.rb +185 -0
  15. data/lib/factbook-readers/builder_json.rb +79 -0
  16. data/lib/factbook-readers/codes.rb +122 -0
  17. data/lib/factbook-readers/comparisons.rb +50 -0
  18. data/lib/factbook-readers/counter.rb +48 -0
  19. data/lib/factbook-readers/normalize.rb +43 -0
  20. data/lib/factbook-readers/page.rb +148 -0
  21. data/lib/factbook-readers/page_info.rb +12 -0
  22. data/lib/factbook-readers/reader_json.rb +51 -0
  23. data/lib/factbook-readers/sanitizer.rb +307 -0
  24. data/lib/factbook-readers/sect.rb +29 -0
  25. data/lib/factbook-readers/subsect.rb +18 -0
  26. data/lib/factbook-readers/table.rb +52 -0
  27. data/lib/factbook-readers/utils.rb +47 -0
  28. data/lib/factbook-readers/utils_info.rb +129 -0
  29. data/lib/factbook-readers/version.rb +24 -0
  30. data/lib/factbook/readers.rb +5 -0
  31. data/test/data/au.html +579 -0
  32. data/test/data/au.yml +8 -0
  33. data/test/data/be.html +596 -0
  34. data/test/data/be.yml +8 -0
  35. data/test/data/json/au.json +892 -0
  36. data/test/data/src/ag.html +716 -0
  37. data/test/data/src/au-2015-09-24.html +2006 -0
  38. data/test/data/src/au.html +658 -0
  39. data/test/data/src/be-2015-09-24.html +2011 -0
  40. data/test/data/src/be.html +648 -0
  41. data/test/helper.rb +11 -0
  42. data/test/test_attribs.rb +87 -0
  43. data/test/test_attribs_def.rb +20 -0
  44. data/test/test_builder.rb +35 -0
  45. data/test/test_codes.rb +76 -0
  46. data/test/test_comparisons.rb +19 -0
  47. data/test/test_convert.rb +30 -0
  48. data/test/test_counter.rb +31 -0
  49. data/test/test_fields.rb +52 -0
  50. data/test/test_importer.rb +56 -0
  51. data/test/test_item_builder.rb +99 -0
  52. data/test/test_json.rb +45 -0
  53. data/test/test_json_builder.rb +25 -0
  54. data/test/test_normalize.rb +23 -0
  55. data/test/test_page.rb +38 -0
  56. data/test/test_sanitizer.rb +39 -0
  57. data/test/test_sanitizer_regex.rb +89 -0
  58. metadata +196 -0
@@ -0,0 +1,75 @@
1
+ Num,Category,Name
2
+ 2147,Geography,Area
3
+ 2119,People and Society,Population
4
+ 2002,People and Society,Population growth rate
5
+ 2054,People and Society,Birth rate
6
+ 2066,People and Society,Death rate
7
+ 2112,People and Society,Net migration rate
8
+ 2223,People and Society,Maternal mortality rate
9
+ 2091,People and Society,Infant mortality rate
10
+ 2102,People and Society,Life expectancy at birth
11
+ 2127,People and Society,Total fertility rate
12
+ 2225,People and Society,Health expenditures
13
+ 2155,People and Society,HIV/AIDS - adult prevalence rate
14
+ 2156,People and Society,HIV/AIDS - people living with HIV/AIDS
15
+ 2157,People and Society,HIV/AIDS - deaths
16
+ 2228,People and Society,Obesity - adult prevalence rate
17
+ 2224,People and Society,Children under the age of 5 years underweight
18
+ 2206,People and Society,Education expenditures
19
+ 2229,People and Society,"Unemployment, youth ages 15-24"
20
+ 2001,Economy,GDP (purchasing power parity)
21
+ 2003,Economy,GDP - real growth rate
22
+ 2004,Economy,GDP - per capita (PPP)
23
+ 2260,Economy,Gross national saving
24
+ 2089,Economy,Industrial production growth rate
25
+ 2095,Economy,Labor force
26
+ 2129,Economy,Unemployment rate
27
+ 2172,Economy,Distribution of family income - Gini index
28
+ 2221,Economy,Taxes and other revenues
29
+ 2222,Economy,Budget surplus (+) or deficit (-)
30
+ 2186,Economy,Public debt
31
+ 2092,Economy,Inflation rate (consumer prices)
32
+ 2207,Economy,Central bank discount rate
33
+ 2208,Economy,Commercial bank prime lending rate
34
+ 2214,Economy,Stock of narrow money
35
+ 2215,Economy,Stock of broad money
36
+ 2211,Economy,Stock of domestic credit
37
+ 2200,Economy,Market value of publicly traded shares
38
+ 2187,Economy,Current account balance
39
+ 2078,Economy,Exports
40
+ 2087,Economy,Imports
41
+ 2188,Economy,Reserves of foreign exchange and gold
42
+ 2079,Economy,Debt - external
43
+ 2198,Economy,Stock of direct foreign investment - at home
44
+ 2199,Economy,Stock of direct foreign investment - abroad
45
+ 2232,Energy,Electricity - production
46
+ 2233,Energy,Electricity - consumption
47
+ 2234,Energy,Electricity - exports
48
+ 2235,Energy,Electricity - imports
49
+ 2236,Energy,Electricity - installed generating capacity
50
+ 2237,Energy,Electricity - from fossil fuels
51
+ 2239,Energy,Electricity - from nuclear fuels
52
+ 2238,Energy,Electricity - from hydroelectric plants
53
+ 2240,Energy,Electricity - from other renewable sources
54
+ 2241,Energy,Crude oil - production
55
+ 2242,Energy,Crude oil - exports
56
+ 2243,Energy,Crude oil - imports
57
+ 2244,Energy,Crude oil - proved reserves
58
+ 2245,Energy,Refined petroleum products - production
59
+ 2246,Energy,Refined petroleum products - consumption
60
+ 2247,Energy,Refined petroleum products - exports
61
+ 2248,Energy,Refined petroleum products - imports
62
+ 2249,Energy,Natural gas - production
63
+ 2250,Energy,Natural gas - consumption
64
+ 2251,Energy,Natural gas - exports
65
+ 2252,Energy,Natural gas - imports
66
+ 2253,Energy,Natural gas - proved reserves
67
+ 2150,Communications,Telephones - fixed lines
68
+ 2151,Communications,Telephones - mobile cellular
69
+ 2153,Communications,Internet users
70
+ 2053,Transportation,Airports
71
+ 2121,Transportation,Railways
72
+ 2085,Transportation,Roadways
73
+ 2093,Transportation,Waterways
74
+ 2108,Transportation,Merchant marine
75
+ 2034,Military,Military expenditures
@@ -0,0 +1,59 @@
1
+ ## 3rd party gems/libs
2
+ ## require 'props'
3
+
4
+ require 'logutils'
5
+ require 'webget'
6
+ require 'csvreader'
7
+
8
+
9
+ require 'nokogiri'
10
+
11
+
12
+
13
+
14
+ # our own code
15
+ require 'factbook-readers/version' # let it always go first
16
+
17
+
18
+ require 'factbook-readers/codes'
19
+ require 'factbook-readers/comparisons'
20
+ require 'factbook-readers/attributes'
21
+
22
module Factbook

  ## Built-in reference data - loaded once, at the time this file gets required,
  ##   from the csv / yaml files found below the package root's data/ directory.
  CODES       = Codes.from_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" )
  COMPARISONS = Comparisons.from_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
  ATTRIBUTES  = Attributes.from_yaml( "#{Factbook::Module::Readers.root}/data/attributes.yml" )

  ## Module-level convenience readers; each one simply returns
  ##   the matching (pre-loaded) constant from above.
  class << self
    def codes
      CODES
    end

    def comparisons
      COMPARISONS
    end

    def attributes
      ATTRIBUTES
    end
  end

end # module Factbook
34
+
35
+ ## note: make codes, comparisons, attributes available
36
+
37
+ require 'factbook-readers/utils'
38
+ require 'factbook-readers/utils_info'
39
+ require 'factbook-readers/sanitizer'
40
+ require 'factbook-readers/normalize'
41
+ require 'factbook-readers/builder_item'
42
+ require 'factbook-readers/builder'
43
+ require 'factbook-readers/builder_json'
44
+ require 'factbook-readers/page'
45
+ require 'factbook-readers/page_info'
46
+ require 'factbook-readers/sect'
47
+ require 'factbook-readers/subsect'
48
+
49
+
50
+ require 'factbook-readers/reader_json'
51
+
52
+ require 'factbook-readers/table' ## e.g. TableReader
53
+
54
+ require 'factbook-readers/counter'
55
+
56
+
57
+
58
+
59
+ puts Factbook::Module::Readers.banner
@@ -0,0 +1,74 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module Factbook
5
+
6
##
# Flat, ordered list of attribute definitions built from a nested hash
#  (e.g. the hash loaded from a yaml file):
#
#   category (top-level key)
#     => any number of nested keys (the "path")
#        => a hash holding a 'name' key (the attribute definition)
#
class Attributes
  include Enumerable   ## adds map, select, etc. on top of each

  Attribute = Struct.new( :name,
                          :category,   ## e.g. Introduction, Geography, etc.
                          :path        ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
                        )

  ## note: !! special cases - for these (path) keys the 'name' key itself
  ##         is a regular nested field and NOT an attribute definition
  NAME_IS_FIELD = ['Capital', 'National anthem'].freeze

  ##
  # Loads the yaml file at +path+ and collects all attribute definitions.
  #
  #  path - file path of the yaml document (top-level keys are the categories)
  #
  # Returns a new Attributes object; an empty yaml document results in
  #  an empty list.
  def self.from_yaml( path )
    h = YAML.load_file( path )
    pp h

    attribs = []

    ## note: build_attribs does NOT change the hash passed in; no copy required
    ##       (h || {}) - an empty yaml file loads as nil / false
    (h || {}).each do |category, v|
      build_attribs( attribs, category, [], v )
    end

    new( attribs )
  end


  ##
  # Walks the (nested) hash +h+ recursively and appends an Attribute
  #  to +attribs+ for every attribute definition found.
  #
  #  attribs  - array collecting the Attribute structs (gets appended to)
  #  category - category (top-level key) recorded in every Attribute found
  #  path     - array of keys walked so far (never changed; copied on descent)
  #  h        - hash to walk; any other value (nil, string, etc.) is skipped
  #
  # Note: +h+ is only read, never modified.
  def self.build_attribs( attribs, category, path, h )
    return unless h.is_a?( Hash )   ## leaf value that is no hash; nothing to walk

    ## assume it's an attribute definition hash if it has a 'name' key
    ##   (except for the special cases e.g. Capital, National anthem)
    definition = h.has_key?( 'name' ) && !NAME_IS_FIELD.include?( path[-1] )

    if definition
      a = Attribute.new( h['name'], category, path )

      puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
      attribs << a
    end

    ## continue walking (recursive)
    h.each do |k,v|
      next if definition && k == 'name'   ## already consumed as the attribute's name
      build_attribs( attribs, category, path + [k], v )   ## note: path + [k] creates a new array (copy)
    end
  end


  def initialize( attribs )
    @attribs = attribs
  end

  def to_a()  @attribs; end
  def size()  @attribs.size; end

  ##
  # Yields every Attribute in order; returns an Enumerator if no block is given.
  def each( &blk )
    return to_enum( :each ) unless blk

    @attribs.each( &blk )
  end

end # class Attributes
72
+
73
+ end # module Factbook
74
+
@@ -0,0 +1,212 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ class Builder ## todo: change to PageBuilder ???
6
+ include LogUtils::Logging
7
+
8
+
9
+ =begin
10
+ def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
11
+ ## check/todo: rename input_dir to just dir or to include ?
12
+ ## (there's no output_dir)?? - why? why not?
13
+ input_dir = opts[:input_dir] || '.'
14
+ self.from_file( "#{input_dir}/#{cc}.html" )
15
+ end
16
+ =end
17
+
18
+
19
## Reads the html page from the file at +path+ and
##   hands the content on to from_string.
def self.from_file( path )
  ## fix/todo: use ASCII8BIT/binary reader !!!!!
  from_string( File.read( path ) )
end
23
+
24
## Creates a new builder for the html page passed in as a string.
##   note: expects ASCII-7BIT/BINARY encoding
def self.from_string( html_ascii )
  new( html_ascii )
end
27
+
28
+
29
+ attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
30
+ :html, ## utf-8 encoded profile
31
+ :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
32
+ :info, ## page info incl. country_name, region_name, last_updated etc.
33
+ :errors, ## encoding erros etc.
34
+ :sects
35
+
36
+
37
##
# Sanitizes the page, marks up and splits the section / subsection headings
#  and builds the sections (Sect) with its subsections (Subsect);
#  the data of every subsection gets read via ItemBuilder.
#
#  html_ascii - full "original" page as a string
def initialize( html_ascii )
  @html_ascii = html_ascii

  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )

  ## note: support "empty" pages - old format waiting for update!!!
  ##    cannot parse for now; results in empty (no) sections
  html_sects = []
  unless @html.empty?
    @html_debug = map_subsects( map_sects( @html ) )
    html_sects  = split_sects( @html_debug )
  end

  pp html_sects

  ## debug
  ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }

  @sects = []
  html_sects.each do |sect_head, sect_subsects|
    puts sect_head
    puts sect_subsects.size

    ## get section title
    ##  @SECTION{Economy}  => Economy
    sect_match = /@SECTION{(.+?)}/.match( sect_head )
    next unless sect_match    ## warn/fix: no section title found

    sect_title = sect_match[1].strip
    puts sect_title
    sect = Sect.new
    sect.title = sect_title

    ## get subsections
    subsects = []
    sect_subsects.each do |subsect_head, subsect_body|
      subsect_match = /@SUBSECTION{(.+?)}/.match( subsect_head )
      next unless subsect_match    ## warn/fix: no subsection title found

      ## remove trailing : if present (plus any whitespace left over)
      subsect_title = subsect_match[1].strip.sub( /:\z/, '' ).strip
      puts subsect_title

      subsect = Subsect.new
      subsect.title = subsect_title
      subsect.data  = Factbook::ItemBuilder.new( subsect_body, subsect_title ).read

      subsects << subsect
    end

    sect.subsects = subsects
    @sects << sect
  end

  self   ## return self -- needed?? default (standard) anyway?? check and remove
end
107
+
108
+
109
+
110
##
# Converts all section titles to the "unified" marker
#   e.g.  <h2>Introduction</h2>   becomes   @SECTION{Introduction}
#  (surrounded by two newlines on each side).
#
# Returns a new string; every heading found gets printed (debug output).
def map_sects( html )
  heading_regex = /<h2>
                    \s*
                    (.+?)    ## note: use non-greedy; do NOT allow tags inside for now
                    \s*
                   <\/h2>
                  /xim

  html.gsub( heading_regex ) do |heading|
    sect_title = Regexp.last_match( 1 )

    puts "** found section >#{sect_title}<:"
    puts " >|#{heading}|<"

    "\n\n@SECTION{#{sect_title}}\n\n"
  end
end
130
+
131
+
132
##
# Converts all subsection titles to the "unified" marker
#   e.g.  <h3>Disputes - international:</h3>   becomes   @SUBSECTION{Disputes - international:}
#  (surrounded by one newline on each side).
#
# Returns a new string; every heading found gets printed (debug output).
def map_subsects( html )
  heading_regex = /<h3>
                    \s*
                    (.+?)    ## note: use non-greedy; allows tags inside - why? why not
                    \s*
                   <\/h3>
                  /xim

  html.gsub( heading_regex ) do |heading|
    subsect_title = Regexp.last_match( 1 )

    puts "** found subsection >#{subsect_title}<:"
    puts " >|#{heading}|<"

    "\n@SUBSECTION{#{subsect_title}}\n"
  end
end
152
+
153
+
154
+
155
##
# Splits the html in sections (divided by the @SECTION{} markers);
#  an optional prolog (anything before the first marker) gets dropped.
#
# Returns an array of pairs
#    [[heading, subsects],
#     [heading, subsects],...]
#  with subsects being the section's body split up by split_subsects.
def split_sects( html )
  ## note: the regex is "wrapped" in (just one) capture group;
  ##   String#split includes all capture groups in the result array
  chunks = html.split( /(@SECTION{.+?})/ )   ## note: use non-greedy -- check: need to escape {} ??

  ## first item is either a section marker or the (html) prolog; if prolog remove
  chunks.shift unless chunks.first =~ /@SECTION/

  ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
  chunks.each_slice( 2 ).map do |heading, body|
    [heading, split_subsects( body )]
  end
end
184
+
185
+
186
##
# Splits the html in subsections (divided by the @SUBSECTION{} markers);
#  an optional prolog (anything before the first marker) gets dropped.
#
# Returns an array of pairs
#    [[heading, body],
#     [heading, body],...]
def split_subsects( html )
  ## note: the regex is "wrapped" in (just one) capture group;
  ##   String#split includes all capture groups in the result array
  chunks = html.split( /(@SUBSECTION{.+?})/ )   ## note: use non-greedy -- check: need to escape {} ??

  ## first item is either a subsection marker or the (html) prolog; if prolog remove
  chunks.shift unless chunks.first =~ /@SUBSECTION/

  chunks.each_slice( 2 ).to_a
end
208
+
209
+ end # class Builder
210
+
211
+
212
+ end # module Factbook
@@ -0,0 +1,185 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
6
+ include LogUtils::Logging
7
+ include NormalizeHelper ## e.g. normalize_category
8
+
9
##  html - html snippet (gets parsed in read)
##  name - category/field name e.g. Area, Location, etc.
def initialize( html, name )
  @html, @name = html, name
end
13
+
14
+
15
+
16
+ ##
17
+ ## <div class="category_data subfield text">
18
+ ## Portuguese (official and most widely spoken language)
19
+ ##
20
+ ## </div>
21
+ ## <div class="category_data note">
22
+ ## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
23
+ ## </div>
24
+
25
+
26
##
# Parses the html snippet of a subsection (field) and returns its data as a hash.
#
# Only the top-level <div>s of the snippet get used. Depending on the div the
#  resulting hash gets
#    'text'            => text                - "implied" text (div w/o subfield name) or historic text(s)
#    'note'            => { 'text' => text }  - div marked with note
#    <normalized name> => { 'text' => text }  - div with (one or more) subfield names
#
# On unexpected markup (e.g. a second note, more than one subfield name)
#  an error gets printed and the program terminates via exit 1.
#
# Note: divs without any class attribute are supported
#   (handled like divs with no matching class).
def read
  ## return hash from html snippet
  doc = Nokogiri::HTML.fragment( @html )

  data = {}

  ## note: a div might have no class attribute at all (attribute reads as nil);
  ##        always go through this helper to check for a class (substring match)
  has_class = ->( div, name ) { div['class'].to_s.include?( name ) }

  ## note:
  ##   skip whitespace text nodes (e.g. \n\n etc); just use divs
  doc_children = doc.children.filter('div')

  puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"

  ## handle special case for
  ##   multiple 'grouped_subfield' first
  ##   e.g. used in
  ##   - Drinking water source:
  ##   - Sanitation facility access:
  grouped_children, other_children = doc_children.partition do |div|
    has_class.call( div, 'grouped_subfield' )
  end

  ## note: only use special rule if more than one div marked grouped_
  if grouped_children.size > 1
    ## continue processing the rest as usual
    doc_children = other_children

    key = nil
    grouped_children.each do |div|
      span_group = div.at( 'span.subfield-group')
      if span_group
        # start a new group
        key = normalize_category( span_group.text.strip )
        span_group.replace( '' )    ## remove the group name; the rest is the text

        text = squish( div.text.strip )
        puts "new group - category_data key >#{key}<: >#{text}<"
        data[ key ] = { 'text' => text }
      else
        ## append to (last) group
        ##  note: expects a group started before (that is, first div has a subfield-group span)
        text = squish( div.text.strip )
        puts "add group - category_data key >#{key}<: >#{text}<"
        data[ key ]['text'] += " / #{text}"
      end
    end
  end


  doc_children.each_with_index do |div,i|
    if has_class.call( div, 'note' )
      text = squish( div.text.strip )
      puts "category_data: >#{text}<"

      ## note: for now only allow one note per subsection/field data block
      if data['note']
        puts "!! ERROR: note already taken:"
        puts data['note']
        puts div.to_html
        exit 1
      end

      data['note'] = { 'text' => text }
    elsif has_class.call( div, 'historic' )
      ## add all historic together into one for now
      text = squish( div.text.strip )
      puts "category_data: >#{text}<"

      if data['text']
        ## append with / for now
        data['text'] += " / #{text}"
      else
        data['text'] = text
        ## check if history is first node
        if i != 0
          puts "!! ERROR: expected first historic node to be first node but it is #{i+1}:"
          puts div.to_html
          exit 1
        end
      end
    elsif div.css( 'span.subfield-name').empty?
      ## assume "implied text field"
      ## check for index == 1 / child count == 1 - why? why not
      text = squish( div.text.strip )
      puts "category_data: >#{text}<"

      data['text'] = text

      ## must be always first node for now
      if i != 0
        puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
        puts div.to_html
        exit 1
      end
    elsif has_class.call( div, 'grouped_subfield' )
      ## split grouped subfield!!
      ##  <span class="subfield-name">arable land:</span>
      ##  <span class="subfield-number">8.6%</span>
      ##  <span class="subfield-date">(2011 est.)</span>
      ##   /
      ##  <span class="subfield-name">permanent crops:</span>
      ##  <span class="subfield-number">0.8%</span>
      ##  <span class="subfield-date">(2011 est.)</span>
      ##   /
      ##  <span class="subfield-name">permanent pasture:</span>
      ##  <span class="subfield-number">23.5%</span>
      ##  <span class="subfield-date">(2011 est.)</span>

      ## join names for now - why? why not?
      ##   e.g. becomes:
      ##      arable land / permanent crops / permanent pasture: for key ??
      keys = div.css( 'span.subfield-name').map do |span|
        name = normalize_category( span.text.strip )
        span.replace( '' )    ## remove the name; the rest is the text
        name
      end
      key  = keys.join( ' / ')
      text = squish( div.text.strip )
      puts "category_data key >#{key}<: >#{text}<"
      data[ key ] = { 'text' => text }
    else
      ## get subfield name
      span_names = div.css( 'span.subfield-name')
      if span_names.size > 1
        puts "!! ERROR - found more than one subfield-name:"
        puts div.to_html
        exit 1
      end
      key = normalize_category( span_names[0].text.strip )
      span_names[0].replace( '' )    ## remove the name; the rest is the text

      text = squish( div.text.strip )
      puts "category_data key >#{key}<: >#{text}<"
      data[ key ] = { 'text' => text }
    end
  end


  pp data
  data
end
175
+
176
+
177
+
178
+
179
## Returns a copy of str with every run of two or more
##   whitespace characters (space, tab, newline, carriage return)
##   replaced by one space.
##   note: a single tab, newline or carriage return stays as is
def squish( str )
  multi_space_regex = /[ \t\n\r]{2,}/
  str.gsub( multi_space_regex, ' ')
end
182
+
183
+ end # class ItemBuilder
184
+
185
+ end # module Factbook