factbook 1.1.1 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -58
- data/README.md +50 -575
- data/Rakefile +29 -33
- data/lib/factbook.rb +8 -75
- metadata +20 -114
- data/data/attributes.yml +0 -337
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook/almanac.rb +0 -72
- data/lib/factbook/attributes.rb +0 -74
- data/lib/factbook/builder.rb +0 -214
- data/lib/factbook/builder_item.rb +0 -92
- data/lib/factbook/builder_json.rb +0 -79
- data/lib/factbook/codes.rb +0 -119
- data/lib/factbook/comparisons.rb +0 -50
- data/lib/factbook/counter.rb +0 -48
- data/lib/factbook/db/importer.rb +0 -92
- data/lib/factbook/db/models.rb +0 -11
- data/lib/factbook/db/schema.rb +0 -36
- data/lib/factbook/normalize.rb +0 -43
- data/lib/factbook/page.rb +0 -185
- data/lib/factbook/page_info.rb +0 -12
- data/lib/factbook/reader_json.rb +0 -51
- data/lib/factbook/sanitizer.rb +0 -207
- data/lib/factbook/sect.rb +0 -29
- data/lib/factbook/subsect.rb +0 -18
- data/lib/factbook/table.rb +0 -52
- data/lib/factbook/utils.rb +0 -85
- data/lib/factbook/utils_info.rb +0 -102
- data/lib/factbook/version.rb +0 -22
- data/script/almanac.rb +0 -48
- data/script/attributes.rb +0 -34
- data/script/build.rb +0 -28
- data/script/counter.rb +0 -145
- data/script/json.rb +0 -18
- data/script/testbr.rb +0 -33
- data/script/testcodes.rb +0 -11
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/au.html +0 -2006
- data/test/data/src/be.html +0 -2011
- data/test/helper.rb +0 -11
- data/test/test_attribs.rb +0 -82
- data/test/test_attribs_def.rb +0 -20
- data/test/test_builder.rb +0 -35
- data/test/test_codes.rb +0 -76
- data/test/test_comparisons.rb +0 -19
- data/test/test_convert.rb +0 -30
- data/test/test_counter.rb +0 -31
- data/test/test_fields.rb +0 -52
- data/test/test_importer.rb +0 -55
- data/test/test_item_builder.rb +0 -99
- data/test/test_json.rb +0 -44
- data/test/test_json_builder.rb +0 -25
- data/test/test_normalize.rb +0 -23
- data/test/test_page.rb +0 -38
- data/test/test_sanitizer.rb +0 -35
data/data/comparisons.csv
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
Num,Category,Name
|
2
|
-
2147,Geography,Area
|
3
|
-
2119,People and Society,Population
|
4
|
-
2002,People and Society,Population growth rate
|
5
|
-
2054,People and Society,Birth rate
|
6
|
-
2066,People and Society,Death rate
|
7
|
-
2112,People and Society,Net migration rate
|
8
|
-
2223,People and Society,Maternal mortality rate
|
9
|
-
2091,People and Society,Infant mortality rate
|
10
|
-
2102,People and Society,Life expectancy at birth
|
11
|
-
2127,People and Society,Total fertility rate
|
12
|
-
2225,People and Society,Health expenditures
|
13
|
-
2155,People and Society,HIV/AIDS - adult prevalence rate
|
14
|
-
2156,People and Society,HIV/AIDS - people living with HIV/AIDS
|
15
|
-
2157,People and Society,HIV/AIDS - deaths
|
16
|
-
2228,People and Society,Obesity - adult prevalence rate
|
17
|
-
2224,People and Society,Children under the age of 5 years underweight
|
18
|
-
2206,People and Society,Education expenditures
|
19
|
-
2229,People and Society,"Unemployment, youth ages 15-24"
|
20
|
-
2001,Economy,GDP (purchasing power parity)
|
21
|
-
2003,Economy,GDP - real growth rate
|
22
|
-
2004,Economy,GDP - per capita (PPP)
|
23
|
-
2260,Economy,Gross national saving
|
24
|
-
2089,Economy,Industrial production growth rate
|
25
|
-
2095,Economy,Labor force
|
26
|
-
2129,Economy,Unemployment rate
|
27
|
-
2172,Economy,Distribution of family income - Gini index
|
28
|
-
2221,Economy,Taxes and other revenues
|
29
|
-
2222,Economy,Budget surplus (+) or deficit (-)
|
30
|
-
2186,Economy,Public debt
|
31
|
-
2092,Economy,Inflation rate (consumer prices)
|
32
|
-
2207,Economy,Central bank discount rate
|
33
|
-
2208,Economy,Commercial bank prime lending rate
|
34
|
-
2214,Economy,Stock of narrow money
|
35
|
-
2215,Economy,Stock of broad money
|
36
|
-
2211,Economy,Stock of domestic credit
|
37
|
-
2200,Economy,Market value of publicly traded shares
|
38
|
-
2187,Economy,Current account balance
|
39
|
-
2078,Economy,Exports
|
40
|
-
2087,Economy,Imports
|
41
|
-
2188,Economy,Reserves of foreign exchange and gold
|
42
|
-
2079,Economy,Debt - external
|
43
|
-
2198,Economy,Stock of direct foreign investment - at home
|
44
|
-
2199,Economy,Stock of direct foreign investment - abroad
|
45
|
-
2232,Energy,Electricity - production
|
46
|
-
2233,Energy,Electricity - consumption
|
47
|
-
2234,Energy,Electricity - exports
|
48
|
-
2235,Energy,Electricity - imports
|
49
|
-
2236,Energy,Electricity - installed generating capacity
|
50
|
-
2237,Energy,Electricity - from fossil fuels
|
51
|
-
2239,Energy,Electricity - from nuclear fuels
|
52
|
-
2238,Energy,Electricity - from hydroelectric plants
|
53
|
-
2240,Energy,Electricity - from other renewable sources
|
54
|
-
2241,Energy,Crude oil - production
|
55
|
-
2242,Energy,Crude oil - exports
|
56
|
-
2243,Energy,Crude oil - imports
|
57
|
-
2244,Energy,Crude oil - proved reserves
|
58
|
-
2245,Energy,Refined petroleum products - production
|
59
|
-
2246,Energy,Refined petroleum products - consumption
|
60
|
-
2247,Energy,Refined petroleum products - exports
|
61
|
-
2248,Energy,Refined petroleum products - imports
|
62
|
-
2249,Energy,Natural gas - production
|
63
|
-
2250,Energy,Natural gas - consumption
|
64
|
-
2251,Energy,Natural gas - exports
|
65
|
-
2252,Energy,Natural gas - imports
|
66
|
-
2253,Energy,Natural gas - proved reserves
|
67
|
-
2150,Communications,Telephones - fixed lines
|
68
|
-
2151,Communications,Telephones - mobile cellular
|
69
|
-
2153,Communications,Internet users
|
70
|
-
2053,Transportation,Airports
|
71
|
-
2121,Transportation,Railways
|
72
|
-
2085,Transportation,Roadways
|
73
|
-
2093,Transportation,Waterways
|
74
|
-
2108,Transportation,Merchant marine
|
75
|
-
2034,Military,Military expenditures
|
data/lib/factbook/almanac.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
class Almanac
|
7
|
-
|
8
|
-
## convenience helper ("factory")
|
9
|
-
def self.from_json( codes, json_dir: '.' )
|
10
|
-
pages = JsonPageReader.new( json_dir ).read_pages( codes )
|
11
|
-
self.new( pages )
|
12
|
-
end
|
13
|
-
|
14
|
-
|
15
|
-
def initialize( pages )
|
16
|
-
@pages = pages
|
17
|
-
end
|
18
|
-
|
19
|
-
def render( template )
|
20
|
-
buf = ''
|
21
|
-
@pages.each do |page|
|
22
|
-
text = PageCtx.new( page, template ).render
|
23
|
-
|
24
|
-
puts text ## for debugging write country profile to console (too)
|
25
|
-
buf << text
|
26
|
-
end
|
27
|
-
puts "count: #{@pages.count}"
|
28
|
-
buf ## return buffered almanac text
|
29
|
-
end
|
30
|
-
|
31
|
-
|
32
|
-
class PageCtx
|
33
|
-
attr_accessor :page
|
34
|
-
|
35
|
-
def initialize(page, template)
|
36
|
-
@page = page
|
37
|
-
@template = template
|
38
|
-
end
|
39
|
-
|
40
|
-
##############################
|
41
|
-
## add some "view helpers"
|
42
|
-
|
43
|
-
def name
|
44
|
-
## -- calculate name (use long name if (short) name is not availabe e.g. none)
|
45
|
-
## e.g. Austria
|
46
|
-
if @name.nil?
|
47
|
-
@name = page.name
|
48
|
-
@name = page.name_long if @name == 'none'
|
49
|
-
end
|
50
|
-
@name
|
51
|
-
end
|
52
|
-
|
53
|
-
def names( separator: ' • ' )
|
54
|
-
## e.g. Austria • Österreich
|
55
|
-
if @names.nil?
|
56
|
-
if page.name_local.blank? || page.name_local == 'none' || page.name_local == name
|
57
|
-
@names = [name] ## no local (in its own non-english language) name
|
58
|
-
else
|
59
|
-
@names = [name, page.name_local]
|
60
|
-
end
|
61
|
-
end
|
62
|
-
@names.join( separator )
|
63
|
-
end
|
64
|
-
|
65
|
-
def render
|
66
|
-
ERB.new( @template).result( binding )
|
67
|
-
end
|
68
|
-
end ## PageCtx
|
69
|
-
|
70
|
-
end ## Almanac
|
71
|
-
|
72
|
-
end # module Factbook
|
data/lib/factbook/attributes.rb
DELETED
@@ -1,74 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
module Factbook
|
5
|
-
|
6
|
-
class Attributes
|
7
|
-
|
8
|
-
Attribute = Struct.new( :name,
|
9
|
-
:category, ## e.g. Introduction, Geography, etc.
|
10
|
-
:path, ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
|
11
|
-
)
|
12
|
-
|
13
|
-
def self.from_yaml( path )
|
14
|
-
|
15
|
-
h = YAML.load_file( path )
|
16
|
-
pp h
|
17
|
-
|
18
|
-
attribs = []
|
19
|
-
|
20
|
-
## note: use a copy (e.g. h.dup) for now (hash gets changed by build_attribs!!)
|
21
|
-
new_h = h.dup
|
22
|
-
new_h.each do |k,v|
|
23
|
-
category = k
|
24
|
-
build_attribs( attribs, category, [], v )
|
25
|
-
end
|
26
|
-
|
27
|
-
self.new( attribs )
|
28
|
-
end
|
29
|
-
|
30
|
-
|
31
|
-
def self.build_attribs( attribs, category, path, h )
|
32
|
-
|
33
|
-
## assume it's an attribute definition hash
|
34
|
-
## note: !! exclude special cases:
|
35
|
-
## Capital -- incl. name key itself
|
36
|
-
## National anthem
|
37
|
-
if h.has_key?( 'name' ) && ['Capital','National anthem'].include?( path[-1] ) == false
|
38
|
-
a = Attribute.new
|
39
|
-
a.name = h['name']
|
40
|
-
a.category = category
|
41
|
-
a.path = path
|
42
|
-
|
43
|
-
puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
|
44
|
-
attribs << a
|
45
|
-
|
46
|
-
## note: make sure a modifable copy (of h) gets passed in
|
47
|
-
h.delete( 'name' )
|
48
|
-
end
|
49
|
-
|
50
|
-
return if h.empty? ## empty hash; nothing (more) to do; return
|
51
|
-
|
52
|
-
## continue walking (recursive)
|
53
|
-
h.each do |k,v|
|
54
|
-
new_path = path.dup << k ## note: create a new array (copy)
|
55
|
-
build_attribs( attribs, category, new_path, v )
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
|
60
|
-
def initialize( attribs )
|
61
|
-
@attribs = attribs
|
62
|
-
end
|
63
|
-
|
64
|
-
def to_a() @attribs; end
|
65
|
-
def size() @attribs.size; end
|
66
|
-
|
67
|
-
def each
|
68
|
-
@attribs.each { |attrib| yield( attrib ) }
|
69
|
-
end
|
70
|
-
|
71
|
-
end # class Attributes
|
72
|
-
|
73
|
-
end # module Factbook
|
74
|
-
|
data/lib/factbook/builder.rb
DELETED
@@ -1,214 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
class Builder ## todo: change to PageBuilder ???
|
6
|
-
include LogUtils::Logging
|
7
|
-
|
8
|
-
|
9
|
-
=begin
|
10
|
-
def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
|
11
|
-
## check/todo: rename input_dir to just dir or to include ?
|
12
|
-
## (there's no output_dir)?? - why? why not?
|
13
|
-
input_dir = opts[:input_dir] || '.'
|
14
|
-
self.from_file( "#{input_dir}/#{cc}.html" )
|
15
|
-
end
|
16
|
-
=end
|
17
|
-
|
18
|
-
|
19
|
-
def self.from_file( path )
|
20
|
-
html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
|
21
|
-
self.from_string( html_ascii )
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.from_string( html_ascii ) ## note: expects ASCII-7BIT/BINARY encoding
|
25
|
-
self.new( html_ascii )
|
26
|
-
end
|
27
|
-
|
28
|
-
|
29
|
-
attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
|
30
|
-
:html, ## utf-8 encoded profile
|
31
|
-
:html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
|
32
|
-
:info, ## page info incl. country_name, region_name, last_updated etc.
|
33
|
-
:errors, ## encoding erros etc.
|
34
|
-
:sects
|
35
|
-
|
36
|
-
|
37
|
-
def initialize( html_ascii )
|
38
|
-
@html_ascii = html_ascii
|
39
|
-
|
40
|
-
## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
|
41
|
-
@html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
|
42
|
-
|
43
|
-
@html_debug = map_sects( @html )
|
44
|
-
@html_debug = map_subsects( @html_debug )
|
45
|
-
|
46
|
-
html_sects = split_sects( @html_debug )
|
47
|
-
pp html_sects
|
48
|
-
|
49
|
-
|
50
|
-
@sects = []
|
51
|
-
html_sects.each do |html_sect|
|
52
|
-
html_sect_head = html_sect[0]
|
53
|
-
html_subsects = html_sect[1]
|
54
|
-
puts html_sect_head
|
55
|
-
puts html_subsects.size
|
56
|
-
|
57
|
-
## get section title
|
58
|
-
## @SECTION{Economy} => Economy
|
59
|
-
if html_sect_head =~ /@SECTION{(.+?)}/
|
60
|
-
title = $1.strip
|
61
|
-
puts title
|
62
|
-
sect = Sect.new
|
63
|
-
sect.title = title
|
64
|
-
## get subsections
|
65
|
-
subsects = []
|
66
|
-
html_subsects.each do |html_subsect|
|
67
|
-
html_subsect_head = html_subsect[0]
|
68
|
-
html_subsect_body = html_subsect[1]
|
69
|
-
if html_subsect_head =~ /@SUBSECTION{(.+?)}/
|
70
|
-
title = $1.strip
|
71
|
-
title = title.sub( /:\z/, '' ) # remove trailing : if present
|
72
|
-
title = title.strip
|
73
|
-
|
74
|
-
puts title
|
75
|
-
subsect = Subsect.new
|
76
|
-
subsect.title = title ## todo/fix: cut off trailing colon (:)
|
77
|
-
|
78
|
-
b = Factbook::ItemBuilder.new( html_subsect_body, title )
|
79
|
-
h = b.read
|
80
|
-
subsect.data = h
|
81
|
-
|
82
|
-
subsects << subsect
|
83
|
-
else
|
84
|
-
## warn/fix: no subsection title found
|
85
|
-
end
|
86
|
-
end
|
87
|
-
sect.subsects = subsects
|
88
|
-
@sects << sect
|
89
|
-
else
|
90
|
-
## warn/fix: no section title found
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
self ## return self -- needed?? default (standard) anyway?? check and remove
|
95
|
-
end
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
def map_sects( html )
|
100
|
-
## convert section titles
|
101
|
-
## from <h2>..</h2>
|
102
|
-
## to "unified" marker
|
103
|
-
|
104
|
-
## e.g.
|
105
|
-
## <h2 sectiontitle='Introduction' ccode='au'>Introduction :: <span class='region'>AUSTRIA </span></h2>
|
106
|
-
## <h2>Introduction</h2>
|
107
|
-
|
108
|
-
title_regex= /<h2
|
109
|
-
(?:\s[^>]+)? ## allow optional attributes in h2
|
110
|
-
>
|
111
|
-
\s*
|
112
|
-
([^<>]+?) ## note: use non-greedy; do NOT allow tags inside for now
|
113
|
-
\s*
|
114
|
-
(?:\s::\s
|
115
|
-
.+? ## note: use non-greedy; allows tags inside
|
116
|
-
)? ## strip optional name (e.g. :: AUSTRIA)
|
117
|
-
<\/h2>
|
118
|
-
/xim
|
119
|
-
|
120
|
-
html = html.gsub( title_regex ) do |m|
|
121
|
-
puts "** found section >#{$1}<:"
|
122
|
-
puts " >|#{m}|<"
|
123
|
-
|
124
|
-
"\n\n@SECTION{#{$1}}\n\n"
|
125
|
-
end
|
126
|
-
html
|
127
|
-
end
|
128
|
-
|
129
|
-
|
130
|
-
def map_subsects( html )
|
131
|
-
## convert subsection titles
|
132
|
-
## from <div id='field'>..</div>
|
133
|
-
## to "unified" marker
|
134
|
-
|
135
|
-
## e.g.
|
136
|
-
## <div id='field' class='category'>Disputes - international:</div>
|
137
|
-
|
138
|
-
title_regex= /<div \s id='field'
|
139
|
-
\s class='category'>
|
140
|
-
\s*
|
141
|
-
(.+?) ## note: use non-greedy; allows tags inside - why? why not
|
142
|
-
\s*
|
143
|
-
<\/div>
|
144
|
-
/xim
|
145
|
-
|
146
|
-
html = html.gsub( title_regex ) do |m|
|
147
|
-
puts "** found subsection >#{$1}<:"
|
148
|
-
puts " >|#{m}|<"
|
149
|
-
|
150
|
-
"\n@SUBSECTION{#{$1}}\n"
|
151
|
-
end
|
152
|
-
html
|
153
|
-
end
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
def split_sects( html )
|
158
|
-
####
|
159
|
-
# split html in sections (divided by section headings)
|
160
|
-
# e.g. remove optional prolog ??,
|
161
|
-
## [[heading,sect],
|
162
|
-
## [heading,sect],
|
163
|
-
## [heading,sect],...]
|
164
|
-
|
165
|
-
## note: "wrap" regex in a capture group (just one)
|
166
|
-
## String#split will include all catpure groups in the result array
|
167
|
-
|
168
|
-
section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
|
169
|
-
|
170
|
-
chunks = html.split( section_regex )
|
171
|
-
|
172
|
-
## check if first item is a section or (html) prolog
|
173
|
-
# if prolog (remove)
|
174
|
-
chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
|
175
|
-
|
176
|
-
pairs = chunks.each_slice(2).to_a
|
177
|
-
|
178
|
-
## now split subsections
|
179
|
-
newpairs = []
|
180
|
-
pairs.each do |item|
|
181
|
-
## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
|
182
|
-
newpairs << [item[0], split_subsects( item[1]) ]
|
183
|
-
end
|
184
|
-
newpairs
|
185
|
-
end
|
186
|
-
|
187
|
-
|
188
|
-
def split_subsects( html )
|
189
|
-
####
|
190
|
-
# split html in subsections (divided by subsection headings)
|
191
|
-
# e.g. remove optional prolog ??,
|
192
|
-
## [[heading,sect],
|
193
|
-
## [heading,sect],
|
194
|
-
## [heading,sect],...]
|
195
|
-
|
196
|
-
## note: "wrap" regex in a capture group (just one)
|
197
|
-
## String#split will include all catpure groups in the result array
|
198
|
-
|
199
|
-
subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
|
200
|
-
|
201
|
-
chunks = html.split( subsection_regex )
|
202
|
-
|
203
|
-
## check if first item is a section or (html) prolog
|
204
|
-
# if prolog (remove)
|
205
|
-
chunks.slice!(0) unless chunks[0] =~ /@SUBSECTION/ ## starts w/ @SUBSECTION
|
206
|
-
|
207
|
-
pairs = chunks.each_slice(2).to_a
|
208
|
-
pairs
|
209
|
-
end
|
210
|
-
|
211
|
-
end # class Builder
|
212
|
-
|
213
|
-
|
214
|
-
end # module Factbook
|
@@ -1,92 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
|
6
|
-
include LogUtils::Logging
|
7
|
-
include NormalizeHelper ## e.g. normalize_category
|
8
|
-
|
9
|
-
def initialize( html, name )
|
10
|
-
@html = html
|
11
|
-
@name = name # add category/field name e.g. Area, Location, etc.
|
12
|
-
end
|
13
|
-
|
14
|
-
def read
|
15
|
-
## return hash from html snippet
|
16
|
-
doc = Nokogiri::HTML.fragment( @html )
|
17
|
-
|
18
|
-
data = {}
|
19
|
-
last_node = nil ## track last hash (always use text key)
|
20
|
-
last_node_data_count = 0
|
21
|
-
|
22
|
-
## note:
|
23
|
-
## skip whitespace text nodes (e.g. \n\n etc); just use divs
|
24
|
-
doc.children.filter('div').each_with_index do |child,i|
|
25
|
-
|
26
|
-
if child['class'] == 'category_data'
|
27
|
-
text = child.text ## fix/todo: use strip
|
28
|
-
puts "category_data: >#{text}<"
|
29
|
-
|
30
|
-
if last_node.nil?
|
31
|
-
## assume its the very first entry; use implied/auto-created category
|
32
|
-
data['text'] = ''
|
33
|
-
last_node = data
|
34
|
-
last_node_data_count = 0
|
35
|
-
end
|
36
|
-
|
37
|
-
### first category_data element?
|
38
|
-
if last_node_data_count == 0
|
39
|
-
if last_node['text'] == ''
|
40
|
-
last_node['text'] = text
|
41
|
-
else ### possible ??? if data_count is zero - not should not include any data
|
42
|
-
## todo: issue warning here - why? why not??
|
43
|
-
last_node['text'] += " #{text}" ## append w/o separator
|
44
|
-
end
|
45
|
-
else
|
46
|
-
if @name == 'Demographic profile' ## special case (use space a sep)
|
47
|
-
last_node['text'] += " #{text}" ## append without (w/o) separator
|
48
|
-
else
|
49
|
-
last_node['text'] += " ++ #{text}" ## append with ++ separator
|
50
|
-
end
|
51
|
-
end
|
52
|
-
last_node_data_count += 1
|
53
|
-
|
54
|
-
elsif child['class'].nil? ## div without any class e.g. <div>..</div>
|
55
|
-
## assume category and category_data pair w/ spans
|
56
|
-
spans = child.children.filter('span')
|
57
|
-
if spans.size > 2
|
58
|
-
puts "*** warn: expected two (or one) spans; got #{spans.inspect}"
|
59
|
-
end
|
60
|
-
|
61
|
-
## pp spans
|
62
|
-
|
63
|
-
span_key = spans[0] ## assume 1st entry is span.category
|
64
|
-
span_value = spans[1] ## assume 2nd entry is span.category_data
|
65
|
-
|
66
|
-
key = normalize_category( span_key.text )
|
67
|
-
|
68
|
-
## note: allow optional category_data for now
|
69
|
-
value = span_value ? span_value.text : nil
|
70
|
-
|
71
|
-
puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
|
72
|
-
|
73
|
-
## start new pair
|
74
|
-
last_node = data[key] = { 'text' => value }
|
75
|
-
last_node_data_count = value ? 1 : 0 ## note: set to 1 if value present
|
76
|
-
else
|
77
|
-
puts "*** warn: item builder -- unknow css class in #{child.inspect}"
|
78
|
-
end
|
79
|
-
|
80
|
-
## pp child
|
81
|
-
## css = child['class']
|
82
|
-
## puts "[#{i}] #{child.name} class='>#{css}< : #{css.class.name}' >#{child.text}<"
|
83
|
-
end
|
84
|
-
|
85
|
-
pp data
|
86
|
-
data
|
87
|
-
end
|
88
|
-
|
89
|
-
|
90
|
-
end # class ItemBuilder
|
91
|
-
|
92
|
-
end # module Factbook
|