factbook-readers 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -8
- data/lib/factbook-readers/builder_item.rb +20 -4
- data/lib/factbook-readers/codes.rb +0 -2
- data/lib/factbook-readers/comparisons.rb +0 -1
- data/lib/factbook-readers/counter.rb +5 -6
- data/lib/factbook-readers/normalize.rb +3 -4
- data/lib/factbook-readers/page_info.rb +0 -1
- data/lib/factbook-readers/reader_json.rb +4 -5
- data/lib/factbook-readers/sanitizer.rb +76 -11
- data/lib/factbook-readers/sect.rb +3 -4
- data/lib/factbook-readers/subsect.rb +0 -1
- data/lib/factbook-readers/table.rb +7 -8
- data/lib/factbook-readers/utils_info.rb +0 -1
- data/lib/factbook-readers/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2bc67eb2f60367d8d0ef00ca718c7d8b81b4a9c8
|
4
|
+
data.tar.gz: f61389d6a073db31e79766c2711bbabb89b27699
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3a565e36afae190e18154bc366bbd3d1a77f06e0c51017ba34893448fd2588fa57e2c93647ef3de5338b423e63787ef248a56c512b67504a522122bb4b24e0ff
|
7
|
+
data.tar.gz: 1cd6b487cb5fb2a2c5b659d2dacf0481ff5368f2e85f977c145bcf46e94f16d0543bbb59dd61995fb1b137c3bb4654308119c89b6ffa7883b0023684101b17dc
|
data/README.md
CHANGED
@@ -33,20 +33,18 @@ resulting in:
|
|
33
33
|
{"total"=>{"text"=>"8,515,770 sq km"},
|
34
34
|
"land"=>{"text"=>"8,358,140 sq km"},
|
35
35
|
"water"=>{"text"=>"157,630 sq km"},
|
36
|
-
"note"=>
|
37
|
-
{"text"=>
|
38
|
-
"includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."}},
|
36
|
+
"note"=> "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."},
|
39
37
|
"Area - comparative"=>
|
40
38
|
{"text"=>"slightly smaller than the US"},
|
41
39
|
"Land boundaries"=>
|
42
40
|
{"total"=>{"text"=>"16,145 km"},
|
43
|
-
"border countries
|
41
|
+
"border countries"=>
|
44
42
|
{"text"=>
|
45
43
|
"Argentina 1,263 km, Bolivia 3,403 km, Colombia 1,790 km,
|
46
44
|
French Guiana 649 km, Guyana 1,308 km, Paraguay 1,371 km, Peru 2,659 km,
|
47
45
|
Suriname 515 km, Uruguay 1,050 km, Venezuela 2,137 km"}},
|
48
46
|
"Climate"=>{"text"=>"mostly tropical, but temperate in south"},
|
49
|
-
"Elevation
|
47
|
+
"Elevation"=>
|
50
48
|
{"lowest point"=>{"text"=>"Atlantic Ocean 0 m"},
|
51
49
|
"highest point"=>{"text"=>"Pico da Neblina 2,994 m"}},
|
52
50
|
"Natural resources"=>
|
@@ -66,7 +64,7 @@ pp page['Geography']['Area']['land']['text']
|
|
66
64
|
# => "8,358,140 sq km"
|
67
65
|
pp page['Geography']['Area']['water']['text']
|
68
66
|
# => "157,630 sq km"
|
69
|
-
pp page['Geography']['Area']['note']
|
67
|
+
pp page['Geography']['Area']['note']
|
70
68
|
# => "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."
|
71
69
|
pp page['Geography']['Area - comparative']['text']
|
72
70
|
# => "slightly smaller than the US"
|
@@ -74,9 +72,9 @@ pp page['Geography']['Climate']['text']
|
|
74
72
|
# => "mostly tropical, but temperate in south"
|
75
73
|
pp page['Geography']['Terrain']['text']
|
76
74
|
# => "mostly flat to rolling lowlands in north; ..."
|
77
|
-
pp page['Geography']['Elevation
|
75
|
+
pp page['Geography']['Elevation']['lowest point']['text']
|
78
76
|
# => "Atlantic Ocean 0 m"
|
79
|
-
pp page['Geography']['Elevation
|
77
|
+
pp page['Geography']['Elevation']['highest point']['text']
|
80
78
|
# => "Pico da Neblina 2,994 m"
|
81
79
|
pp page['Geography']['Natural resources']['text']
|
82
80
|
# => "bauxite, gold, iron ore, manganese, nickel, phosphates, ..."
|
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
@@ -45,7 +44,7 @@ def read
|
|
45
44
|
other_children = []
|
46
45
|
|
47
46
|
doc_children.each do |div|
|
48
|
-
if div['class'].index( 'grouped_subfield' )
|
47
|
+
if div['class'] && div['class'].index( 'grouped_subfield' )
|
49
48
|
grouped_children << div
|
50
49
|
else
|
51
50
|
other_children << div
|
@@ -79,7 +78,8 @@ def read
|
|
79
78
|
end
|
80
79
|
|
81
80
|
|
82
|
-
|
81
|
+
doc_children.each_with_index do |div,i|
|
82
|
+
if div['class'] && div['class'].index( 'category_data' )
|
83
83
|
if div['class'].index( 'note' )
|
84
84
|
text = squish( div.text.strip )
|
85
85
|
puts "category_data: >#{text}<"
|
@@ -92,7 +92,8 @@ def read
|
|
92
92
|
exit 1
|
93
93
|
end
|
94
94
|
|
95
|
-
|
95
|
+
## note: add note directly (that is, W/O extra hash and text node/key)
|
96
|
+
data['note'] = text
|
96
97
|
elsif div['class'].index( 'historic' )
|
97
98
|
## add all historic together into one for now
|
98
99
|
text = squish( div.text.strip )
|
@@ -166,7 +167,22 @@ def read
|
|
166
167
|
puts "category_data key >#{key}<: >#{text}<"
|
167
168
|
data[ key ] = { 'text' => text }
|
168
169
|
end
|
170
|
+
else
|
171
|
+
text = squish( div.text.strip )
|
172
|
+
if text =~ /country\s+
|
173
|
+
comparison\s+
|
174
|
+
to\s+
|
175
|
+
the\s+
|
176
|
+
world:\s+
|
177
|
+
([0-9]+)/xim
|
178
|
+
data[ 'country comparison to the world' ] = $1.to_i
|
179
|
+
else
|
180
|
+
puts "!! ERROR: div (W/O category_data class):"
|
181
|
+
puts div.to_html
|
182
|
+
exit 1
|
183
|
+
end
|
169
184
|
end
|
185
|
+
end
|
170
186
|
|
171
187
|
|
172
188
|
pp data
|
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
@@ -24,20 +23,20 @@ def walk( page, hin, hout )
|
|
24
23
|
hin.each do |k,v|
|
25
24
|
if v.is_a? Hash
|
26
25
|
hout2 = hout[k] || { count: 0, codes: '' }
|
27
|
-
|
26
|
+
|
28
27
|
hout2[ :count ] += 1
|
29
|
-
|
28
|
+
|
30
29
|
## delete codes if count is larger than the threshold x (e.g. 9)
|
31
30
|
hout2.delete( :codes ) if hout2[ :count ] > 9
|
32
31
|
|
33
32
|
codes = hout2[ :codes ]
|
34
33
|
if codes ## note: might have been deleted if the threshold was passed (e.g. 9 entries)
|
35
34
|
codes << ' ' unless codes.empty? ## add separator (space for now)
|
36
|
-
codes << page.info.country_code
|
35
|
+
codes << page.info.country_code
|
37
36
|
hout2[ :codes ] = codes
|
38
37
|
end
|
39
|
-
|
40
|
-
hout[k] = hout2
|
38
|
+
|
39
|
+
hout[k] = hout2
|
41
40
|
walk( page, v, hout2 )
|
42
41
|
end
|
43
42
|
end
|
@@ -1,6 +1,5 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
|
-
module Factbook
|
2
|
+
module Factbook
|
4
3
|
module NormalizeHelper
|
5
4
|
|
6
5
|
|
@@ -17,7 +16,7 @@ def normalize_category( text )
|
|
17
16
|
|
18
17
|
## typos e.g ntoe => use note
|
19
18
|
text = 'note' if text == 'ntoe'
|
20
|
-
text = 'investment in fixed capital' if text == 'investment if fixed capital'
|
19
|
+
text = 'investment in fixed capital' if text == 'investment if fixed capital'
|
21
20
|
|
22
21
|
## downcase
|
23
22
|
text = 'lowest point' if text == 'Lowest point'
|
@@ -34,7 +33,7 @@ def normalize_category( text )
|
|
34
33
|
|
35
34
|
## border countries (8): -- remove (x) counter
|
36
35
|
text = 'border countries' if text.start_with?( 'border countries')
|
37
|
-
|
36
|
+
|
38
37
|
text
|
39
38
|
end
|
40
39
|
|
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
@@ -12,7 +11,7 @@ def read_page( code )
|
|
12
11
|
path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"
|
13
12
|
|
14
13
|
puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
|
15
|
-
json = File.read( path )
|
14
|
+
json = File.read( path, 'r:utf-8' ) { |f| f.read }
|
16
15
|
|
17
16
|
## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
|
18
17
|
# add some page info from code struct
|
@@ -21,7 +20,7 @@ def read_page( code )
|
|
21
20
|
info.country_code = code.code
|
22
21
|
info.country_name = code.name
|
23
22
|
info.region_name = code.region
|
24
|
-
|
23
|
+
|
25
24
|
page = Page.new( code.code, json: json, info: info )
|
26
25
|
page
|
27
26
|
end
|
@@ -31,8 +30,8 @@ def read_pages( codes, limit: nil )
|
|
31
30
|
i=0
|
32
31
|
codes.each do |code|
|
33
32
|
next if limit && i > limit ## for debugging just process first x entries
|
34
|
-
|
35
|
-
pages << read_page( code )
|
33
|
+
|
34
|
+
pages << read_page( code )
|
36
35
|
end
|
37
36
|
pages
|
38
37
|
end
|
@@ -114,19 +114,65 @@ def find_country_profile( html )
|
|
114
114
|
}
|
115
115
|
puts " #{li_children.size} div(s) in >#{section_title}<:"
|
116
116
|
|
117
|
+
|
118
|
+
## check special case in world Geographic overview:
|
119
|
+
# <div class="category oce_light" style="padding-left:5px;"
|
120
|
+
# id="field-anchor-geography-geographic-overview">
|
121
|
+
# Geographic overview:
|
122
|
+
# <span class="field-listing-link">
|
123
|
+
# <a href="../fields/275.html#XX">
|
124
|
+
# <img alt="Geographic overview field listing"
|
125
|
+
# title="Geographic overview field listing"
|
126
|
+
# src="../images/field_listing.gif" /></a>
|
127
|
+
# </span>
|
128
|
+
#</div>
|
129
|
+
# vs regular
|
130
|
+
#
|
131
|
+
# <div class="category oce_light" style="padding-left:5px;"
|
132
|
+
# id="field-anchor-geography-area-comparative">
|
133
|
+
# <span class="btn-tooltip definition" role="tooltip" aria-hidden='true'>
|
134
|
+
# <a aria-label="Use this link to access a description of the Area - comparative field"
|
135
|
+
# href="../docs/notesanddefs.html#280">
|
136
|
+
# Area - comparative
|
137
|
+
# </a>:
|
138
|
+
# <span class="tooltip-content">
|
139
|
+
# This entry provides an area comparison based on total area equivalents. Most entities are compared with the entire US or one of the 50 states based on area measurements (1990 revised) provided by the US Bureau of the Census. The smaller entities are compared with Washington, DC (178 sq km, 69 sq mi) or The Mall in Washington, DC (0.59 sq km, 0.23 sq mi, 146 acres).
|
140
|
+
# </span>
|
141
|
+
# </span>
|
142
|
+
# <span class="field-listing-link">
|
143
|
+
# <a href="../fields/280.html#XX"><img alt="Area - comparative field listing" title="Area - comparative field listing" src="../images/field_listing.gif" /></a>
|
144
|
+
# </span>
|
145
|
+
# </div>
|
146
|
+
|
117
147
|
li_children.each_slice(2) do |divs|
|
118
148
|
div = divs[0]
|
119
|
-
a = div.css('a')[0]
|
120
149
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
puts "!! WARN: no anchor found:"
|
127
|
-
puts div.to_html
|
150
|
+
## try new way - try clean-up / rm first
|
151
|
+
span_tooltip_content = div.at( 'span.tooltip-content' )
|
152
|
+
if span_tooltip_content
|
153
|
+
span_tooltip_content.inner_html = ''
|
154
|
+
span_tooltip_content.replace( '' ) ## check for how to delete/remove - why? why not!!
|
128
155
|
end
|
129
156
|
|
157
|
+
span_field_listing_link = div.at( 'span.field-listing-link' )
|
158
|
+
if span_field_listing_link
|
159
|
+
span_field_listing_link.inner_html = ''
|
160
|
+
span_field_listing_link.replace( '' )
|
161
|
+
end
|
162
|
+
|
163
|
+
subsection_title = div.text.strip
|
164
|
+
html << "\n<h3>#{subsection_title}</h3>\n"
|
165
|
+
|
166
|
+
# a = div.css('a')[0]
|
167
|
+
# if a
|
168
|
+
# subsection_title = a.text ## todo/check/rename: use field_name or such - why? why not?
|
169
|
+
# html << "\n<h3>#{subsection_title}:</h3>\n"
|
170
|
+
# else
|
171
|
+
# subsection_title = '???'
|
172
|
+
# puts "!! WARN: no anchor found:"
|
173
|
+
# puts div.to_html
|
174
|
+
# end
|
175
|
+
|
130
176
|
|
131
177
|
div = divs[1]
|
132
178
|
div_children = div.children.select {|el| el.name == 'div' ? true : false }
|
@@ -157,7 +203,19 @@ def find_country_profile( html )
|
|
157
203
|
end
|
158
204
|
else
|
159
205
|
if catdiv.to_html.index( 'country comparison to the world' )
|
160
|
-
##
|
206
|
+
## simplify/unlinkify country comparison
|
207
|
+
## <div>
|
208
|
+
## <span class='category'>country comparison to the world:</span>
|
209
|
+
## <span class='category_data'>
|
210
|
+
## <a href="../fields/335rank.html#AU">97</a>
|
211
|
+
## </span>
|
212
|
+
## </div>
|
213
|
+
## e.g. to =>
|
214
|
+
## <div>
|
215
|
+
## country comparison to the world: 97
|
216
|
+
## </div>
|
217
|
+
html << "<div>\n #{squish( catdiv.text.strip )}\n</div>"
|
218
|
+
html << "\n"
|
161
219
|
else
|
162
220
|
puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
|
163
221
|
puts catdiv.to_html
|
@@ -229,6 +287,9 @@ def sanitize_data( el, title: )
|
|
229
287
|
## see fr (france) in political parties section for example
|
230
288
|
## todo/check/fix: check if we need to use unicode char!! and NOT html entity
|
231
289
|
inner_html = inner_html.gsub( "&nbsp;", ' ' )
|
290
|
+
## Unicode Character 'NO-BREAK SPACE' (U+00A0)
|
291
|
+
inner_html = inner_html.gsub( "\u00A0", ' ' ) ## use unicode char
|
292
|
+
|
232
293
|
|
233
294
|
el.inner_html = inner_html.rstrip + "\n"
|
234
295
|
|
@@ -272,13 +333,17 @@ def sanitize_data( el, title: )
|
|
272
333
|
#####
|
273
334
|
# "unfancy" smart quotes to ascii - why? why not?
|
274
335
|
# e.g.
|
275
|
-
#
|
336
|
+
# Following Britain’s victory => Following Britain's victory
|
276
337
|
html = html.tr( "’", "'" )
|
277
|
-
|
338
|
+
# “full floor” House vote => "full floor" House vote
|
339
|
+
html = html.tr( "“”", '""' )
|
278
340
|
|
279
341
|
html
|
280
342
|
end
|
281
343
|
|
344
|
+
def squish( str )
|
345
|
+
str.gsub( /[ \t\n\r]{2,}/, ' ' ) ## replace multi-spaces (incl. newlines) with one space
|
346
|
+
end
|
282
347
|
|
283
348
|
|
284
349
|
end # class Sanitizer
|
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
@@ -6,9 +5,9 @@ module Factbook
|
|
6
5
|
class Sect
|
7
6
|
include LogUtils::Logging
|
8
7
|
|
9
|
-
attr_accessor :title ## use name instead of title - why? why not?
|
8
|
+
attr_accessor :title ## use name instead of title - why? why not?
|
10
9
|
attr_accessor :subsects
|
11
|
-
|
10
|
+
|
12
11
|
def initialize
|
13
12
|
@subsects = []
|
14
13
|
end
|
@@ -16,7 +15,7 @@ class Sect
|
|
16
15
|
def data
|
17
16
|
## convert sects to hash
|
18
17
|
@data = {}
|
19
|
-
|
18
|
+
|
20
19
|
subsects.each_with_index do |subsect,i|
|
21
20
|
@data[ subsect.title ] = subsect.data
|
22
21
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
@@ -9,12 +8,12 @@ module Factbook
|
|
9
8
|
##
|
10
9
|
## for now reads in rows with values separated by at least 3+ spaces e.g.:
|
11
10
|
## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
|
12
|
-
## 1 China 1,367,485,388
|
13
|
-
## 2 India 1,251,695,584
|
14
|
-
## 3 European Union 513,949,445
|
15
|
-
## 4 United States 321,368,864
|
16
|
-
## 5 Indonesia 255,993,674
|
17
|
-
## 6 Brazil 204,259,812
|
11
|
+
## 1 China 1,367,485,388
|
12
|
+
## 2 India 1,251,695,584
|
13
|
+
## 3 European Union 513,949,445
|
14
|
+
## 4 United States 321,368,864
|
15
|
+
## 5 Indonesia 255,993,674
|
16
|
+
## 6 Brazil 204,259,812
|
18
17
|
|
19
18
|
|
20
19
|
class TableReader
|
@@ -38,7 +37,7 @@ def read
|
|
38
37
|
end
|
39
38
|
|
40
39
|
values = line.split( /[ ]{3,}/ ) ## split three or more spaces - use just two ?? why? why not??
|
41
|
-
|
40
|
+
|
42
41
|
## puts line
|
43
42
|
## pp values
|
44
43
|
recs << values
|