factbook 2.0.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +0 -61
- data/README.md +8 -506
- data/Rakefile +4 -9
- data/lib/factbook.rb +4 -64
- metadata +6 -124
- data/data/attributes.yml +0 -337
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook/almanac.rb +0 -72
- data/lib/factbook/attributes.rb +0 -74
- data/lib/factbook/builder.rb +0 -212
- data/lib/factbook/builder_item.rb +0 -126
- data/lib/factbook/builder_json.rb +0 -79
- data/lib/factbook/codes.rb +0 -119
- data/lib/factbook/comparisons.rb +0 -50
- data/lib/factbook/counter.rb +0 -48
- data/lib/factbook/db/importer.rb +0 -92
- data/lib/factbook/db/models.rb +0 -11
- data/lib/factbook/db/schema.rb +0 -36
- data/lib/factbook/normalize.rb +0 -43
- data/lib/factbook/page.rb +0 -148
- data/lib/factbook/page_info.rb +0 -12
- data/lib/factbook/reader_json.rb +0 -51
- data/lib/factbook/sanitizer.rb +0 -178
- data/lib/factbook/sect.rb +0 -29
- data/lib/factbook/subsect.rb +0 -18
- data/lib/factbook/table.rb +0 -52
- data/lib/factbook/utils.rb +0 -85
- data/lib/factbook/utils_info.rb +0 -129
- data/lib/factbook/version.rb +0 -21
- data/script/almanac.rb +0 -48
- data/script/attributes.rb +0 -34
- data/script/build.rb +0 -28
- data/script/counter.rb +0 -145
- data/script/json.rb +0 -19
- data/script/testbr.rb +0 -33
- data/script/testcodes.rb +0 -11
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/helper.rb +0 -11
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_builder.rb +0 -35
- data/test/test_codes.rb +0 -76
- data/test/test_comparisons.rb +0 -19
- data/test/test_convert.rb +0 -30
- data/test/test_counter.rb +0 -31
- data/test/test_fields.rb +0 -52
- data/test/test_importer.rb +0 -56
- data/test/test_item_builder.rb +0 -99
- data/test/test_json.rb +0 -45
- data/test/test_json_builder.rb +0 -25
- data/test/test_normalize.rb +0 -23
- data/test/test_page.rb +0 -38
- data/test/test_sanitizer.rb +0 -39
- data/test/test_sanitizer_regex.rb +0 -89
data/data/comparisons.csv
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
Num,Category,Name
|
2
|
-
2147,Geography,Area
|
3
|
-
2119,People and Society,Population
|
4
|
-
2002,People and Society,Population growth rate
|
5
|
-
2054,People and Society,Birth rate
|
6
|
-
2066,People and Society,Death rate
|
7
|
-
2112,People and Society,Net migration rate
|
8
|
-
2223,People and Society,Maternal mortality rate
|
9
|
-
2091,People and Society,Infant mortality rate
|
10
|
-
2102,People and Society,Life expectancy at birth
|
11
|
-
2127,People and Society,Total fertility rate
|
12
|
-
2225,People and Society,Health expenditures
|
13
|
-
2155,People and Society,HIV/AIDS - adult prevalence rate
|
14
|
-
2156,People and Society,HIV/AIDS - people living with HIV/AIDS
|
15
|
-
2157,People and Society,HIV/AIDS - deaths
|
16
|
-
2228,People and Society,Obesity - adult prevalence rate
|
17
|
-
2224,People and Society,Children under the age of 5 years underweight
|
18
|
-
2206,People and Society,Education expenditures
|
19
|
-
2229,People and Society,"Unemployment, youth ages 15-24"
|
20
|
-
2001,Economy,GDP (purchasing power parity)
|
21
|
-
2003,Economy,GDP - real growth rate
|
22
|
-
2004,Economy,GDP - per capita (PPP)
|
23
|
-
2260,Economy,Gross national saving
|
24
|
-
2089,Economy,Industrial production growth rate
|
25
|
-
2095,Economy,Labor force
|
26
|
-
2129,Economy,Unemployment rate
|
27
|
-
2172,Economy,Distribution of family income - Gini index
|
28
|
-
2221,Economy,Taxes and other revenues
|
29
|
-
2222,Economy,Budget surplus (+) or deficit (-)
|
30
|
-
2186,Economy,Public debt
|
31
|
-
2092,Economy,Inflation rate (consumer prices)
|
32
|
-
2207,Economy,Central bank discount rate
|
33
|
-
2208,Economy,Commercial bank prime lending rate
|
34
|
-
2214,Economy,Stock of narrow money
|
35
|
-
2215,Economy,Stock of broad money
|
36
|
-
2211,Economy,Stock of domestic credit
|
37
|
-
2200,Economy,Market value of publicly traded shares
|
38
|
-
2187,Economy,Current account balance
|
39
|
-
2078,Economy,Exports
|
40
|
-
2087,Economy,Imports
|
41
|
-
2188,Economy,Reserves of foreign exchange and gold
|
42
|
-
2079,Economy,Debt - external
|
43
|
-
2198,Economy,Stock of direct foreign investment - at home
|
44
|
-
2199,Economy,Stock of direct foreign investment - abroad
|
45
|
-
2232,Energy,Electricity - production
|
46
|
-
2233,Energy,Electricity - consumption
|
47
|
-
2234,Energy,Electricity - exports
|
48
|
-
2235,Energy,Electricity - imports
|
49
|
-
2236,Energy,Electricity - installed generating capacity
|
50
|
-
2237,Energy,Electricity - from fossil fuels
|
51
|
-
2239,Energy,Electricity - from nuclear fuels
|
52
|
-
2238,Energy,Electricity - from hydroelectric plants
|
53
|
-
2240,Energy,Electricity - from other renewable sources
|
54
|
-
2241,Energy,Crude oil - production
|
55
|
-
2242,Energy,Crude oil - exports
|
56
|
-
2243,Energy,Crude oil - imports
|
57
|
-
2244,Energy,Crude oil - proved reserves
|
58
|
-
2245,Energy,Refined petroleum products - production
|
59
|
-
2246,Energy,Refined petroleum products - consumption
|
60
|
-
2247,Energy,Refined petroleum products - exports
|
61
|
-
2248,Energy,Refined petroleum products - imports
|
62
|
-
2249,Energy,Natural gas - production
|
63
|
-
2250,Energy,Natural gas - consumption
|
64
|
-
2251,Energy,Natural gas - exports
|
65
|
-
2252,Energy,Natural gas - imports
|
66
|
-
2253,Energy,Natural gas - proved reserves
|
67
|
-
2150,Communications,Telephones - fixed lines
|
68
|
-
2151,Communications,Telephones - mobile cellular
|
69
|
-
2153,Communications,Internet users
|
70
|
-
2053,Transportation,Airports
|
71
|
-
2121,Transportation,Railways
|
72
|
-
2085,Transportation,Roadways
|
73
|
-
2093,Transportation,Waterways
|
74
|
-
2108,Transportation,Merchant marine
|
75
|
-
2034,Military,Military expenditures
|
data/lib/factbook/almanac.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
class Almanac
|
7
|
-
|
8
|
-
## convenience helper ("factory")
|
9
|
-
def self.from_json( codes, json_dir: '.' )
|
10
|
-
pages = JsonPageReader.new( json_dir ).read_pages( codes )
|
11
|
-
self.new( pages )
|
12
|
-
end
|
13
|
-
|
14
|
-
|
15
|
-
def initialize( pages )
|
16
|
-
@pages = pages
|
17
|
-
end
|
18
|
-
|
19
|
-
def render( template )
|
20
|
-
buf = ''
|
21
|
-
@pages.each do |page|
|
22
|
-
text = PageCtx.new( page, template ).render
|
23
|
-
|
24
|
-
puts text ## for debugging write country profile to console (too)
|
25
|
-
buf << text
|
26
|
-
end
|
27
|
-
puts "count: #{@pages.count}"
|
28
|
-
buf ## return buffered almanac text
|
29
|
-
end
|
30
|
-
|
31
|
-
|
32
|
-
class PageCtx
|
33
|
-
attr_accessor :page
|
34
|
-
|
35
|
-
def initialize(page, template)
|
36
|
-
@page = page
|
37
|
-
@template = template
|
38
|
-
end
|
39
|
-
|
40
|
-
##############################
|
41
|
-
## add some "view helpers"
|
42
|
-
|
43
|
-
def name
|
44
|
-
## -- calculate name (use long name if (short) name is not availabe e.g. none)
|
45
|
-
## e.g. Austria
|
46
|
-
if @name.nil?
|
47
|
-
@name = page.name
|
48
|
-
@name = page.name_long if @name == 'none'
|
49
|
-
end
|
50
|
-
@name
|
51
|
-
end
|
52
|
-
|
53
|
-
def names( separator: ' • ' )
|
54
|
-
## e.g. Austria • Österreich
|
55
|
-
if @names.nil?
|
56
|
-
if page.name_local.blank? || page.name_local == 'none' || page.name_local == name
|
57
|
-
@names = [name] ## no local (in its own non-english language) name
|
58
|
-
else
|
59
|
-
@names = [name, page.name_local]
|
60
|
-
end
|
61
|
-
end
|
62
|
-
@names.join( separator )
|
63
|
-
end
|
64
|
-
|
65
|
-
def render
|
66
|
-
ERB.new( @template).result( binding )
|
67
|
-
end
|
68
|
-
end ## PageCtx
|
69
|
-
|
70
|
-
end ## Almanac
|
71
|
-
|
72
|
-
end # module Factbook
|
data/lib/factbook/attributes.rb
DELETED
@@ -1,74 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
module Factbook
|
5
|
-
|
6
|
-
class Attributes
|
7
|
-
|
8
|
-
Attribute = Struct.new( :name,
|
9
|
-
:category, ## e.g. Introduction, Geography, etc.
|
10
|
-
:path, ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
|
11
|
-
)
|
12
|
-
|
13
|
-
def self.from_yaml( path )
|
14
|
-
|
15
|
-
h = YAML.load_file( path )
|
16
|
-
pp h
|
17
|
-
|
18
|
-
attribs = []
|
19
|
-
|
20
|
-
## note: use a copy (e.g. h.dup) for now (hash gets changed by build_attribs!!)
|
21
|
-
new_h = h.dup
|
22
|
-
new_h.each do |k,v|
|
23
|
-
category = k
|
24
|
-
build_attribs( attribs, category, [], v )
|
25
|
-
end
|
26
|
-
|
27
|
-
self.new( attribs )
|
28
|
-
end
|
29
|
-
|
30
|
-
|
31
|
-
def self.build_attribs( attribs, category, path, h )
|
32
|
-
|
33
|
-
## assume it's an attribute definition hash
|
34
|
-
## note: !! exclude special cases:
|
35
|
-
## Capital -- incl. name key itself
|
36
|
-
## National anthem
|
37
|
-
if h.has_key?( 'name' ) && ['Capital','National anthem'].include?( path[-1] ) == false
|
38
|
-
a = Attribute.new
|
39
|
-
a.name = h['name']
|
40
|
-
a.category = category
|
41
|
-
a.path = path
|
42
|
-
|
43
|
-
puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
|
44
|
-
attribs << a
|
45
|
-
|
46
|
-
## note: make sure a modifable copy (of h) gets passed in
|
47
|
-
h.delete( 'name' )
|
48
|
-
end
|
49
|
-
|
50
|
-
return if h.empty? ## empty hash; nothing (more) to do; return
|
51
|
-
|
52
|
-
## continue walking (recursive)
|
53
|
-
h.each do |k,v|
|
54
|
-
new_path = path.dup << k ## note: create a new array (copy)
|
55
|
-
build_attribs( attribs, category, new_path, v )
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
|
60
|
-
def initialize( attribs )
|
61
|
-
@attribs = attribs
|
62
|
-
end
|
63
|
-
|
64
|
-
def to_a() @attribs; end
|
65
|
-
def size() @attribs.size; end
|
66
|
-
|
67
|
-
def each
|
68
|
-
@attribs.each { |attrib| yield( attrib ) }
|
69
|
-
end
|
70
|
-
|
71
|
-
end # class Attributes
|
72
|
-
|
73
|
-
end # module Factbook
|
74
|
-
|
data/lib/factbook/builder.rb
DELETED
@@ -1,212 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
class Builder ## todo: change to PageBuilder ???
|
6
|
-
include LogUtils::Logging
|
7
|
-
|
8
|
-
|
9
|
-
=begin
|
10
|
-
def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
|
11
|
-
## check/todo: rename input_dir to just dir or to include ?
|
12
|
-
## (there's no output_dir)?? - why? why not?
|
13
|
-
input_dir = opts[:input_dir] || '.'
|
14
|
-
self.from_file( "#{input_dir}/#{cc}.html" )
|
15
|
-
end
|
16
|
-
=end
|
17
|
-
|
18
|
-
|
19
|
-
def self.from_file( path )
|
20
|
-
html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
|
21
|
-
self.from_string( html_ascii )
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.from_string( html_ascii ) ## note: expects ASCII-7BIT/BINARY encoding
|
25
|
-
self.new( html_ascii )
|
26
|
-
end
|
27
|
-
|
28
|
-
|
29
|
-
attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
|
30
|
-
:html, ## utf-8 encoded profile
|
31
|
-
:html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
|
32
|
-
:info, ## page info incl. country_name, region_name, last_updated etc.
|
33
|
-
:errors, ## encoding erros etc.
|
34
|
-
:sects
|
35
|
-
|
36
|
-
|
37
|
-
def initialize( html_ascii )
|
38
|
-
@html_ascii = html_ascii
|
39
|
-
|
40
|
-
## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
|
41
|
-
@html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
|
42
|
-
|
43
|
-
|
44
|
-
html_sects = if @html.empty?
|
45
|
-
## note: support "empty" pages - old format waiting for update!!!
|
46
|
-
## cannot parse for now
|
47
|
-
[] ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
|
48
|
-
else
|
49
|
-
@html_debug = map_sects( @html )
|
50
|
-
@html_debug = map_subsects( @html_debug )
|
51
|
-
|
52
|
-
split_sects( @html_debug )
|
53
|
-
end
|
54
|
-
|
55
|
-
pp html_sects
|
56
|
-
|
57
|
-
## debug
|
58
|
-
## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
|
59
|
-
|
60
|
-
|
61
|
-
@sects = []
|
62
|
-
html_sects.each do |html_sect|
|
63
|
-
html_sect_head = html_sect[0]
|
64
|
-
html_subsects = html_sect[1]
|
65
|
-
puts html_sect_head
|
66
|
-
puts html_subsects.size
|
67
|
-
|
68
|
-
## get section title
|
69
|
-
## @SECTION{Economy} => Economy
|
70
|
-
if html_sect_head =~ /@SECTION{(.+?)}/
|
71
|
-
title = $1.strip
|
72
|
-
puts title
|
73
|
-
sect = Sect.new
|
74
|
-
sect.title = title
|
75
|
-
## get subsections
|
76
|
-
subsects = []
|
77
|
-
html_subsects.each do |html_subsect|
|
78
|
-
html_subsect_head = html_subsect[0]
|
79
|
-
html_subsect_body = html_subsect[1]
|
80
|
-
if html_subsect_head =~ /@SUBSECTION{(.+?)}/
|
81
|
-
title = $1.strip
|
82
|
-
title = title.sub( /:\z/, '' ) # remove trailing : if present
|
83
|
-
title = title.strip
|
84
|
-
|
85
|
-
puts title
|
86
|
-
subsect = Subsect.new
|
87
|
-
subsect.title = title ## todo/fix: cut off trailing colon (:)
|
88
|
-
|
89
|
-
b = Factbook::ItemBuilder.new( html_subsect_body, title )
|
90
|
-
h = b.read
|
91
|
-
subsect.data = h
|
92
|
-
|
93
|
-
subsects << subsect
|
94
|
-
else
|
95
|
-
## warn/fix: no subsection title found
|
96
|
-
end
|
97
|
-
end
|
98
|
-
sect.subsects = subsects
|
99
|
-
@sects << sect
|
100
|
-
else
|
101
|
-
## warn/fix: no section title found
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
self ## return self -- needed?? default (standard) anyway?? check and remove
|
106
|
-
end
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
def map_sects( html )
|
111
|
-
## convert section titles to "unified" marker
|
112
|
-
## e.g.
|
113
|
-
## <h2>Introduction</h2>
|
114
|
-
|
115
|
-
title_regex= /<h2>
|
116
|
-
\s*
|
117
|
-
(.+?) ## note: use non-greedy; do NOT allow tags inside for now
|
118
|
-
\s*
|
119
|
-
<\/h2>
|
120
|
-
/xim
|
121
|
-
|
122
|
-
html = html.gsub( title_regex ) do |m|
|
123
|
-
puts "** found section >#{$1}<:"
|
124
|
-
puts " >|#{m}|<"
|
125
|
-
|
126
|
-
"\n\n@SECTION{#{$1}}\n\n"
|
127
|
-
end
|
128
|
-
html
|
129
|
-
end
|
130
|
-
|
131
|
-
|
132
|
-
def map_subsects( html )
|
133
|
-
## convert subsection titles to "unified" marker
|
134
|
-
## e.g.
|
135
|
-
## <h3>Disputes - international:</h3>
|
136
|
-
|
137
|
-
title_regex= /<h3>
|
138
|
-
\s*
|
139
|
-
(.+?) ## note: use non-greedy; allows tags inside - why? why not
|
140
|
-
\s*
|
141
|
-
<\/h3>
|
142
|
-
/xim
|
143
|
-
|
144
|
-
html = html.gsub( title_regex ) do |m|
|
145
|
-
puts "** found subsection >#{$1}<:"
|
146
|
-
puts " >|#{m}|<"
|
147
|
-
|
148
|
-
"\n@SUBSECTION{#{$1}}\n"
|
149
|
-
end
|
150
|
-
html
|
151
|
-
end
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
def split_sects( html )
|
156
|
-
####
|
157
|
-
# split html in sections (divided by section headings)
|
158
|
-
# e.g. remove optional prolog ??,
|
159
|
-
## [[heading,sect],
|
160
|
-
## [heading,sect],
|
161
|
-
## [heading,sect],...]
|
162
|
-
|
163
|
-
## note: "wrap" regex in a capture group (just one)
|
164
|
-
## String#split will include all catpure groups in the result array
|
165
|
-
|
166
|
-
section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
|
167
|
-
|
168
|
-
chunks = html.split( section_regex )
|
169
|
-
|
170
|
-
## check if first item is a section or (html) prolog
|
171
|
-
# if prolog (remove)
|
172
|
-
chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
|
173
|
-
|
174
|
-
pairs = chunks.each_slice(2).to_a
|
175
|
-
|
176
|
-
## now split subsections
|
177
|
-
newpairs = []
|
178
|
-
pairs.each do |item|
|
179
|
-
## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
|
180
|
-
newpairs << [item[0], split_subsects( item[1]) ]
|
181
|
-
end
|
182
|
-
newpairs
|
183
|
-
end
|
184
|
-
|
185
|
-
|
186
|
-
def split_subsects( html )
|
187
|
-
####
|
188
|
-
# split html in subsections (divided by subsection headings)
|
189
|
-
# e.g. remove optional prolog ??,
|
190
|
-
## [[heading,sect],
|
191
|
-
## [heading,sect],
|
192
|
-
## [heading,sect],...]
|
193
|
-
|
194
|
-
## note: "wrap" regex in a capture group (just one)
|
195
|
-
## String#split will include all catpure groups in the result array
|
196
|
-
|
197
|
-
subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
|
198
|
-
|
199
|
-
chunks = html.split( subsection_regex )
|
200
|
-
|
201
|
-
## check if first item is a section or (html) prolog
|
202
|
-
# if prolog (remove)
|
203
|
-
chunks.slice!(0) unless chunks[0] =~ /@SUBSECTION/ ## starts w/ @SUBSECTION
|
204
|
-
|
205
|
-
pairs = chunks.each_slice(2).to_a
|
206
|
-
pairs
|
207
|
-
end
|
208
|
-
|
209
|
-
end # class Builder
|
210
|
-
|
211
|
-
|
212
|
-
end # module Factbook
|
@@ -1,126 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
|
6
|
-
include LogUtils::Logging
|
7
|
-
include NormalizeHelper ## e.g. normalize_category
|
8
|
-
|
9
|
-
def initialize( html, name )
|
10
|
-
@html = html
|
11
|
-
@name = name # add category/field name e.g. Area, Location, etc.
|
12
|
-
end
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
##
|
17
|
-
## <div class="category_data subfield text">
|
18
|
-
## Portuguese (official and most widely spoken language)
|
19
|
-
##
|
20
|
-
## </div>
|
21
|
-
## <div class="category_data note">
|
22
|
-
## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
|
23
|
-
## </div>
|
24
|
-
|
25
|
-
|
26
|
-
def read
|
27
|
-
## return hash from html snippet
|
28
|
-
doc = Nokogiri::HTML.fragment( @html )
|
29
|
-
|
30
|
-
data = {}
|
31
|
-
|
32
|
-
## note:
|
33
|
-
## skip whitespace text nodes (e.g. \n\n etc); just use divs
|
34
|
-
doc_children = doc.children.filter('div')
|
35
|
-
|
36
|
-
puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
|
37
|
-
|
38
|
-
doc_children.each_with_index do |div,i|
|
39
|
-
if div['class'].index( 'note' )
|
40
|
-
text = squish( div.text.strip )
|
41
|
-
puts "category_data: >#{text}<"
|
42
|
-
|
43
|
-
data['note'] = { 'text' => text }
|
44
|
-
elsif div['class'].index( 'historic' )
|
45
|
-
## add all historic together into one for now
|
46
|
-
text = squish( div.text.strip )
|
47
|
-
puts "category_data: >#{text}<"
|
48
|
-
|
49
|
-
if i == 0
|
50
|
-
data['text'] = text
|
51
|
-
else
|
52
|
-
## append with / for now
|
53
|
-
data['text'] += " / #{text}"
|
54
|
-
end
|
55
|
-
elsif div.css( 'span.subfield-name').empty?
|
56
|
-
## assume "implied text field"
|
57
|
-
## check for index == 1 / child count == 1 - why? why not
|
58
|
-
text = squish( div.text.strip ) ## fix/todo: use strip
|
59
|
-
puts "category_data: >#{text}<"
|
60
|
-
|
61
|
-
data['text'] = text
|
62
|
-
|
63
|
-
## must be always first node for now
|
64
|
-
if i != 0
|
65
|
-
puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
|
66
|
-
puts @html
|
67
|
-
exit 1
|
68
|
-
end
|
69
|
-
elsif div['class'].index( 'grouped_subfield' )
|
70
|
-
## split grouped subfield!!
|
71
|
-
## <span class="subfield-name">arable land:</span>
|
72
|
-
## <span class="subfield-number">8.6%</span>
|
73
|
-
## <span class="subfield-date">(2011 est.)</span>
|
74
|
-
## /
|
75
|
-
## <span class="subfield-name">permanent crops:</span>
|
76
|
-
## <span class="subfield-number">0.8%</span>
|
77
|
-
## <span class="subfield-date">(2011 est.)</span>
|
78
|
-
## /
|
79
|
-
## <span class="subfield-name">permanent pasture:</span>
|
80
|
-
## <span class="subfield-number">23.5%</span>
|
81
|
-
## <span class="subfield-date">(2011 est.)</span>
|
82
|
-
|
83
|
-
## join names for now - why? why not?
|
84
|
-
## e.g. becomes:
|
85
|
-
## arable land / permanent crops / permanent pasture: for key ??
|
86
|
-
span_names = div.css( 'span.subfield-name')
|
87
|
-
keys = []
|
88
|
-
span_names.each do |span|
|
89
|
-
keys << normalize_category( span.text.strip )
|
90
|
-
span.replace( '' )
|
91
|
-
end
|
92
|
-
key = keys.join( ' / ')
|
93
|
-
text = squish( div.text.strip )
|
94
|
-
puts "category_data key >#{key}<: >#{text}<"
|
95
|
-
data[ key ] = { 'text' => text }
|
96
|
-
else
|
97
|
-
## get subfield name
|
98
|
-
span_names = div.css( 'span.subfield-name')
|
99
|
-
if span_names.size > 1
|
100
|
-
puts "!! ERROR - found more than one subfield-name:"
|
101
|
-
puts div.to_html
|
102
|
-
exit 1
|
103
|
-
end
|
104
|
-
key = normalize_category( span_names[0].text.strip )
|
105
|
-
span_names[0].replace( '' )
|
106
|
-
|
107
|
-
text = squish( div.text.strip )
|
108
|
-
puts "category_data key >#{key}<: >#{text}<"
|
109
|
-
data[ key ] = { 'text' => text }
|
110
|
-
end
|
111
|
-
end
|
112
|
-
|
113
|
-
|
114
|
-
pp data
|
115
|
-
data
|
116
|
-
end
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
def squish( str )
|
121
|
-
str.gsub( /[ \t\n\r]{2,}/, ' ') ## replace multi-spaces (incl. newlines with once space)
|
122
|
-
end
|
123
|
-
|
124
|
-
end # class ItemBuilder
|
125
|
-
|
126
|
-
end # module Factbook
|