factbook-readers 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Manifest.txt +3 -25
- data/README.md +11 -69
- data/Rakefile +3 -3
- data/lib/factbook-readers.rb +5 -40
- data/lib/factbook-readers/convert.rb +37 -0
- data/lib/factbook-readers/counter.rb +7 -9
- data/lib/factbook-readers/page.rb +41 -61
- data/lib/factbook-readers/page_info.rb +15 -3
- data/lib/factbook-readers/version.rb +2 -2
- data/test/helper.rb +3 -0
- data/test/test_counter.rb +9 -6
- data/test/test_download.rb +27 -0
- data/test/test_fields.rb +44 -27
- data/test/test_json.rb +4 -4
- data/test/test_page.rb +8 -8
- data/test/test_version.rb +15 -0
- metadata +11 -48
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook-readers/builder.rb +0 -187
- data/lib/factbook-readers/builder_item.rb +0 -201
- data/lib/factbook-readers/builder_json.rb +0 -68
- data/lib/factbook-readers/codes.rb +0 -121
- data/lib/factbook-readers/comparisons.rb +0 -49
- data/lib/factbook-readers/normalize.rb +0 -42
- data/lib/factbook-readers/reader_json.rb +0 -50
- data/lib/factbook-readers/sanitizer.rb +0 -351
- data/lib/factbook-readers/sect.rb +0 -28
- data/lib/factbook-readers/subsect.rb +0 -17
- data/lib/factbook-readers/table.rb +0 -51
- data/lib/factbook-readers/utils.rb +0 -47
- data/lib/factbook-readers/utils_info.rb +0 -128
- data/test/test_builder.rb +0 -30
- data/test/test_codes.rb +0 -72
- data/test/test_comparisons.rb +0 -16
- data/test/test_item_builder.rb +0 -97
- data/test/test_json_builder.rb +0 -23
- data/test/test_normalize.rb +0 -21
- data/test/test_sanitizer.rb +0 -36
- data/test/test_sanitizer_regex.rb +0 -87
data/data/comparisons.csv
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
Num,Category,Name
|
2
|
-
2147,Geography,Area
|
3
|
-
2119,People and Society,Population
|
4
|
-
2002,People and Society,Population growth rate
|
5
|
-
2054,People and Society,Birth rate
|
6
|
-
2066,People and Society,Death rate
|
7
|
-
2112,People and Society,Net migration rate
|
8
|
-
2223,People and Society,Maternal mortality rate
|
9
|
-
2091,People and Society,Infant mortality rate
|
10
|
-
2102,People and Society,Life expectancy at birth
|
11
|
-
2127,People and Society,Total fertility rate
|
12
|
-
2225,People and Society,Health expenditures
|
13
|
-
2155,People and Society,HIV/AIDS - adult prevalence rate
|
14
|
-
2156,People and Society,HIV/AIDS - people living with HIV/AIDS
|
15
|
-
2157,People and Society,HIV/AIDS - deaths
|
16
|
-
2228,People and Society,Obesity - adult prevalence rate
|
17
|
-
2224,People and Society,Children under the age of 5 years underweight
|
18
|
-
2206,People and Society,Education expenditures
|
19
|
-
2229,People and Society,"Unemployment, youth ages 15-24"
|
20
|
-
2001,Economy,GDP (purchasing power parity)
|
21
|
-
2003,Economy,GDP - real growth rate
|
22
|
-
2004,Economy,GDP - per capita (PPP)
|
23
|
-
2260,Economy,Gross national saving
|
24
|
-
2089,Economy,Industrial production growth rate
|
25
|
-
2095,Economy,Labor force
|
26
|
-
2129,Economy,Unemployment rate
|
27
|
-
2172,Economy,Distribution of family income - Gini index
|
28
|
-
2221,Economy,Taxes and other revenues
|
29
|
-
2222,Economy,Budget surplus (+) or deficit (-)
|
30
|
-
2186,Economy,Public debt
|
31
|
-
2092,Economy,Inflation rate (consumer prices)
|
32
|
-
2207,Economy,Central bank discount rate
|
33
|
-
2208,Economy,Commercial bank prime lending rate
|
34
|
-
2214,Economy,Stock of narrow money
|
35
|
-
2215,Economy,Stock of broad money
|
36
|
-
2211,Economy,Stock of domestic credit
|
37
|
-
2200,Economy,Market value of publicly traded shares
|
38
|
-
2187,Economy,Current account balance
|
39
|
-
2078,Economy,Exports
|
40
|
-
2087,Economy,Imports
|
41
|
-
2188,Economy,Reserves of foreign exchange and gold
|
42
|
-
2079,Economy,Debt - external
|
43
|
-
2198,Economy,Stock of direct foreign investment - at home
|
44
|
-
2199,Economy,Stock of direct foreign investment - abroad
|
45
|
-
2232,Energy,Electricity - production
|
46
|
-
2233,Energy,Electricity - consumption
|
47
|
-
2234,Energy,Electricity - exports
|
48
|
-
2235,Energy,Electricity - imports
|
49
|
-
2236,Energy,Electricity - installed generating capacity
|
50
|
-
2237,Energy,Electricity - from fossil fuels
|
51
|
-
2239,Energy,Electricity - from nuclear fuels
|
52
|
-
2238,Energy,Electricity - from hydroelectric plants
|
53
|
-
2240,Energy,Electricity - from other renewable sources
|
54
|
-
2241,Energy,Crude oil - production
|
55
|
-
2242,Energy,Crude oil - exports
|
56
|
-
2243,Energy,Crude oil - imports
|
57
|
-
2244,Energy,Crude oil - proved reserves
|
58
|
-
2245,Energy,Refined petroleum products - production
|
59
|
-
2246,Energy,Refined petroleum products - consumption
|
60
|
-
2247,Energy,Refined petroleum products - exports
|
61
|
-
2248,Energy,Refined petroleum products - imports
|
62
|
-
2249,Energy,Natural gas - production
|
63
|
-
2250,Energy,Natural gas - consumption
|
64
|
-
2251,Energy,Natural gas - exports
|
65
|
-
2252,Energy,Natural gas - imports
|
66
|
-
2253,Energy,Natural gas - proved reserves
|
67
|
-
2150,Communications,Telephones - fixed lines
|
68
|
-
2151,Communications,Telephones - mobile cellular
|
69
|
-
2153,Communications,Internet users
|
70
|
-
2053,Transportation,Airports
|
71
|
-
2121,Transportation,Railways
|
72
|
-
2085,Transportation,Roadways
|
73
|
-
2093,Transportation,Waterways
|
74
|
-
2108,Transportation,Merchant marine
|
75
|
-
2034,Military,Military expenditures
|
@@ -1,187 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
class Builder ## todo: change to HtmlBuilder or PageBuilder ???
|
5
|
-
include LogUtils::Logging
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
## read-only accessors for the page artifacts assigned in initialize
attr_reader :html_original, ## full "original" 1:1 page
|
10
|
-
:html, ## cut-out and sanitized profile
|
11
|
-
:html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
|
12
|
-
:info, ## page info incl. country_name, region_name, last_updated etc.
|
13
|
-
:errors, ## encoding errors etc.
|
14
|
-
:sects
|
15
|
-
|
16
|
-
|
17
|
-
## Parse one factbook page into sections and subsections.
##
## html_original - the full, unmodified page html (kept as-is in @html_original)
##
## Steps:
##   1) Sanitizer#sanitize returns the cut-out html plus page info and errors
##   2) <h2>/<h3> headings get rewritten to @SECTION{..} / @SUBSECTION{..} markers
##      (the marked-up text is kept in @html_debug)
##   3) the marked-up html is split on the markers and every subsection body
##      is handed to Factbook::ItemBuilder#read
##
## Sets @html, @info, @errors, @html_debug and @sects.
##
## note: trace output goes through logger (expected to come from the
##       LogUtils::Logging mixin included by this class) and no longer
##       gets dumped to STDOUT with puts/pp
def initialize( html_original )
  @html_original = html_original

  @html, @info, @errors = Sanitizer.new.sanitize( @html_original )

  html_sects = if @html.empty?
                 ## note: support "empty" pages - old format waiting for update!!!
                 ##   cannot parse for now; return empty (no) sections
                 @html_debug = ''
                 []
               else
                 @html_debug = map_subsects( map_sects( @html ) )
                 split_sects( @html_debug )
               end

  @sects = []
  html_sects.each do |sect_head, html_subsects|
    ## get section title
    ##   @SECTION{Economy}  =>  Economy
    sect_title = sect_head.to_s[ /@SECTION\{(.+?)\}/, 1 ]
    if sect_title.nil?
      logger.warn "no section title found in >#{sect_head}< - skipping section"
      next
    end
    sect_title = sect_title.strip

    subsects = []
    Array( html_subsects ).each do |subsect_head, subsect_body|
      ## get subsection title
      ##   @SUBSECTION{Area:}  =>  Area
      subsect_title = subsect_head.to_s[ /@SUBSECTION\{(.+?)\}/, 1 ]
      if subsect_title.nil?
        logger.warn "no subsection title found in >#{subsect_head}< - skipping subsection"
        next
      end
      ## remove trailing : if present
      subsect_title = subsect_title.strip.sub( /:\z/, '' ).strip

      subsect       = Subsect.new
      subsect.title = subsect_title
      subsect.data  = Factbook::ItemBuilder.new( subsect_body, subsect_title ).read
      subsects << subsect
    end

    logger.debug "section >#{sect_title}< - #{subsects.size} subsection(s)"

    sect          = Sect.new
    sect.title    = sect_title
    sect.subsects = subsects
    @sects << sect
  end
end
|
85
|
-
|
86
|
-
|
87
|
-
## Matches one <h2>..</h2> section heading (case-insensitive, may span lines).
## Capture 1 holds the heading content without leading / trailing whitespace.
H2_RE = /<h2>
          \s*
          (.+?)    ## note: non-greedy - anything up to the first closing h2 tag
          \s*
         <\/h2>
        /xim

## Convert section titles to the "unified" marker, e.g.
##   <h2>Introduction</h2>   =>   \n\n@SECTION{Introduction}\n\n
##
## html - page html (string)
##
## Returns a new string; the passed-in html is not modified.
def map_sects( html )
  html.gsub( H2_RE ) do |heading|
    title = Regexp.last_match( 1 )
    ## trace via logger (LogUtils::Logging mixin) instead of printing to STDOUT
    logger.debug "** found section >#{title}<: >|#{heading}|<"

    "\n\n@SECTION{#{title}}\n\n"
  end
end
|
107
|
-
|
108
|
-
|
109
|
-
## Matches one <h3>..</h3> subsection heading (case-insensitive, may span lines).
## Capture 1 holds the heading content without leading / trailing whitespace.
H3_RE = /<h3>
          \s*
          (.+?)    ## note: use non-greedy; allows tags inside - why? why not
          \s*
         <\/h3>
        /xim

## Convert subsection titles to the "unified" marker, e.g.
##   <h3>Disputes - international:</h3>   =>   \n@SUBSECTION{Disputes - international:}\n
##
## html - page html (string)
##
## Returns a new string; the passed-in html is not modified.
def map_subsects( html )
  html.gsub( H3_RE ) do |heading|
    title = Regexp.last_match( 1 )
    ## trace via logger (LogUtils::Logging mixin) instead of printing to STDOUT
    logger.debug "** found subsection >#{title}<: >|#{heading}|<"

    "\n@SUBSECTION{#{title}}\n"
  end
end
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
## Split html into sections (divided by @SECTION{..} markers).
##
## html - html with section (and subsection) markers already in place
##
## Returns an array of [heading, subsects] pairs, e.g.
##   [["@SECTION{Introduction}", [..]],
##    ["@SECTION{Geography}",    [..]], ...]
## where subsects is whatever split_subsects returns for the section body.
## Any text before the first marker (prolog) gets dropped.
def split_sects( html )
  ## note: "wrap" regex in a capture group (just one)
  ##   String#split will include all capture groups in the result array
  chunks = html.split( /(@SECTION\{.+?\})/ )

  ## the first chunk is always the text in front of the first marker
  ##   (an empty string if the html starts with a marker), that is,
  ##   the prolog - always remove it; testing the chunk for "@SECTION"
  ##   kept a prolog that just mentions the word and shifted all pairs
  chunks.shift

  ## note: split drops a trailing empty string, thus, a marker at the
  ##   very end has no body (nil) - treat as an empty section body
  chunks.each_slice( 2 ).map do |heading, body|
    [heading, split_subsects( body.to_s )]
  end
end
|
160
|
-
|
161
|
-
|
162
|
-
## Split (section) html into subsections (divided by @SUBSECTION{..} markers).
##
## html - section body with subsection markers already in place
##
## Returns an array of [heading, body] string pairs, e.g.
##   [["@SUBSECTION{Area:}",    "<div>..</div>"],
##    ["@SUBSECTION{Climate:}", "<div>..</div>"], ...]
## Any text before the first marker (prolog) gets dropped.
def split_subsects( html )
  ## note: "wrap" regex in a capture group (just one)
  ##   String#split will include all capture groups in the result array
  chunks = html.split( /(@SUBSECTION\{.+?\})/ )

  ## the first chunk is always the text in front of the first marker
  ##   (an empty string if the html starts with a marker) - always remove
  chunks.shift

  ## note: a marker at the very end has no body (split drops the trailing
  ##   empty string) - return an empty string instead of a missing / nil body
  chunks.each_slice( 2 ).map { |heading, body| [heading, body.to_s] }
end
|
183
|
-
|
184
|
-
end # class Builder
|
185
|
-
|
186
|
-
|
187
|
-
end # module Factbook
|
@@ -1,201 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
|
5
|
-
include LogUtils::Logging
|
6
|
-
include NormalizeHelper ## e.g. normalize_category
|
7
|
-
|
8
|
-
## Keep the html snippet to parse plus the name it belongs to.
##
## html - html snippet (later parsed by read)
## name - category/field name e.g. Area, Location, etc.
def initialize( html, name )
  @html, @name = html, name
end
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
##
|
16
|
-
## <div class="category_data subfield text">
|
17
|
-
## Portuguese (official and most widely spoken language)
|
18
|
-
##
|
19
|
-
## </div>
|
20
|
-
## <div class="category_data note">
|
21
|
-
## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
|
22
|
-
## </div>
|
23
|
-
|
24
|
-
|
25
|
-
## Turn the html snippet (@html) of one field into a hash.
##
## Only the top-level <div> children get looked at (whitespace text nodes
## etc. are skipped). Keys written into the result:
##   'text'  - "implied" text of a div w/o subfield name; historic divs
##             get appended with " / "
##   'note'  - text of the (single) note div - added directly as a string
##   <name>  - { 'text' => .. } for every named subfield; the name is
##             passed through normalize_category; grouped subfields use
##             the names joined with " / "
##   'country comparison to the world' - rank as integer
##
## Returns the hash.
## Raises RuntimeError (message includes the offending div's html) on
## markup that does not fit the rules above.
##   note: the old code printed to STDOUT and called exit 1 here, killing
##   the host process; an uncaught error still ends a script with status 1.
def read
  doc  = Nokogiri::HTML.fragment( @html )
  data = {}

  ## note:
  ##   skip whitespace text nodes (e.g. \n\n etc); just use divs
  divs = doc.children.filter( 'div' )

  ## trace via logger (LogUtils::Logging mixin) instead of printing to STDOUT
  logger.debug " parsing >#{@name}< - #{divs.size} category_data divs(s):"

  ## handle special case for
  ##   multiple 'grouped_subfield' first
  ##  e.g. used in
  ##    - Drinking water source:
  ##    - Sanitation facility access:
  grouped, others = divs.partition do |div|
    div['class'].to_s.include?( 'grouped_subfield' )
  end

  ## note: only use special rule if more than one div marked grouped_
  if grouped.size > 1
    ## continue processing the rest as usual
    divs = others

    key = nil
    grouped.each do |div|
      span_group = div.at( 'span.subfield-group' )
      if span_group
        ## start a new group
        key = normalize_category( span_group.text.strip )
        span_group.replace( '' )   ## cut the name, so div.text is the value only

        text = squish( div.text.strip )
        logger.debug "new group - category_data key >#{key}<: >#{text}<"
        data[ key ] = { 'text' => text }
      else
        ## append to (last) group - there must be one already
        unless data.key?( key )
          raise "!! ERROR: grouped subfield W/O preceding subfield-group:\n#{div.to_html}"
        end

        text = squish( div.text.strip )
        logger.debug "add group - category_data key >#{key}<: >#{text}<"
        data[ key ]['text'] += " / #{text}"
      end
    end
  end

  divs.each_with_index do |div, i|
    css_class = div['class'].to_s

    unless css_class.include?( 'category_data' )
      ## only div allowed w/o category_data class is the country comparison
      text = squish( div.text.strip )
      rank = text[ /country\s+
                    comparison\s+
                    to\s+
                    the\s+
                    world:\s+
                    ([0-9]+)/xim, 1 ]
      unless rank
        raise "!! ERROR: div (W/O category_data class):\n#{div.to_html}"
      end
      data[ 'country comparison to the world' ] = rank.to_i
      next
    end

    if css_class.include?( 'note' )
      text = squish( div.text.strip )
      logger.debug "category_data: >#{text}<"

      ## note: for now only allow one note per subsection/field data block
      if data['note']
        raise "!! ERROR: note already taken:\n#{data['note']}\n#{div.to_html}"
      end

      ## note: add note directly (that is, W/O extra hash and text node/key)
      data['note'] = text
    elsif css_class.include?( 'historic' )
      ## add all historic together into one for now
      text = squish( div.text.strip )
      logger.debug "category_data: >#{text}<"

      if data['text']
        ## append with / for now
        data['text'] += " / #{text}"
      else
        ## the first historic node must be the first node
        if i != 0
          raise "!! ERROR: expected first historic node to be first node but it is #{i+1}:\n#{div.to_html}"
        end
        data['text'] = text
      end
    elsif div.css( 'span.subfield-name' ).empty?
      ## assume "implied text field" - must be always first node for now
      if i != 0
        raise "!! ERROR - 'implied' category W/O name NOT first div / node:\n#{div.to_html}"
      end

      text = squish( div.text.strip )
      logger.debug "category_data: >#{text}<"
      data['text'] = text
    elsif css_class.include?( 'grouped_subfield' )
      ## split grouped subfield!!
      ##   <span class="subfield-name">arable land:</span>
      ##   <span class="subfield-number">8.6%</span>
      ##   <span class="subfield-date">(2011 est.)</span>
      ##   /
      ##   <span class="subfield-name">permanent crops:</span>
      ##   <span class="subfield-number">0.8%</span>
      ##   <span class="subfield-date">(2011 est.)</span>
      ##
      ## join names for now, e.g. key becomes:
      ##   arable land / permanent crops / permanent pasture
      keys = div.css( 'span.subfield-name' ).map do |span|
        name = normalize_category( span.text.strip )
        span.replace( '' )
        name
      end
      key  = keys.join( ' / ' )
      text = squish( div.text.strip )
      logger.debug "category_data key >#{key}<: >#{text}<"
      data[ key ] = { 'text' => text }
    else
      ## get subfield name - exactly one expected
      span_names = div.css( 'span.subfield-name' )
      if span_names.size > 1
        raise "!! ERROR - found more than one subfield-name:\n#{div.to_html}"
      end

      key = normalize_category( span_names[0].text.strip )
      span_names[0].replace( '' )

      text = squish( div.text.strip )
      logger.debug "category_data key >#{key}<: >#{text}<"
      data[ key ] = { 'text' => text }
    end
  end

  logger.debug data.inspect
  data
end
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
## Collapse every run of two or more whitespace characters (space, tab,
## newline, carriage return) into one space.
##   note: a single whitespace character is kept as is
##
## str - text to clean up
##
## Returns a new string.
def squish( str )
  str.gsub( /[ \t\n\r]{2,}/ ) { ' ' }
end
|
198
|
-
|
199
|
-
end # class ItemBuilder
|
200
|
-
|
201
|
-
end # module Factbook
|