factbook-readers 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Manifest.txt +56 -0
- data/README.md +196 -0
- data/Rakefile +34 -0
- data/data/attributes.yml +337 -0
- data/data/categories.csv +164 -0
- data/data/codes.csv +262 -0
- data/data/codesxref.csv +280 -0
- data/data/comparisons.csv +75 -0
- data/lib/factbook-readers.rb +59 -0
- data/lib/factbook-readers/attributes.rb +74 -0
- data/lib/factbook-readers/builder.rb +212 -0
- data/lib/factbook-readers/builder_item.rb +185 -0
- data/lib/factbook-readers/builder_json.rb +79 -0
- data/lib/factbook-readers/codes.rb +122 -0
- data/lib/factbook-readers/comparisons.rb +50 -0
- data/lib/factbook-readers/counter.rb +48 -0
- data/lib/factbook-readers/normalize.rb +43 -0
- data/lib/factbook-readers/page.rb +148 -0
- data/lib/factbook-readers/page_info.rb +12 -0
- data/lib/factbook-readers/reader_json.rb +51 -0
- data/lib/factbook-readers/sanitizer.rb +307 -0
- data/lib/factbook-readers/sect.rb +29 -0
- data/lib/factbook-readers/subsect.rb +18 -0
- data/lib/factbook-readers/table.rb +52 -0
- data/lib/factbook-readers/utils.rb +47 -0
- data/lib/factbook-readers/utils_info.rb +129 -0
- data/lib/factbook-readers/version.rb +24 -0
- data/lib/factbook/readers.rb +5 -0
- data/test/data/au.html +579 -0
- data/test/data/au.yml +8 -0
- data/test/data/be.html +596 -0
- data/test/data/be.yml +8 -0
- data/test/data/json/au.json +892 -0
- data/test/data/src/ag.html +716 -0
- data/test/data/src/au-2015-09-24.html +2006 -0
- data/test/data/src/au.html +658 -0
- data/test/data/src/be-2015-09-24.html +2011 -0
- data/test/data/src/be.html +648 -0
- data/test/helper.rb +11 -0
- data/test/test_attribs.rb +87 -0
- data/test/test_attribs_def.rb +20 -0
- data/test/test_builder.rb +35 -0
- data/test/test_codes.rb +76 -0
- data/test/test_comparisons.rb +19 -0
- data/test/test_convert.rb +30 -0
- data/test/test_counter.rb +31 -0
- data/test/test_fields.rb +52 -0
- data/test/test_importer.rb +56 -0
- data/test/test_item_builder.rb +99 -0
- data/test/test_json.rb +45 -0
- data/test/test_json_builder.rb +25 -0
- data/test/test_normalize.rb +23 -0
- data/test/test_page.rb +38 -0
- data/test/test_sanitizer.rb +39 -0
- data/test/test_sanitizer_regex.rb +89 -0
- metadata +196 -0
@@ -0,0 +1,51 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
##
## Reads country profile pages from pre-generated json files laid out as
##   <json_dir>/<region slug>/<country code>.json
class JsonPageReader
  ## json_dir - root directory that holds one sub-directory per region slug
  def initialize( json_dir )
    @json_dir = json_dir
  end

  ## Read the json page for a single country.
  ##
  ##  code - record responding to #code, #name and #region
  ##
  ## returns a Page built from the raw json text plus a PageInfo
  ##   filled in from the code record
  def read_page( code )
    path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"

    puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
    json = File.read( path )

    ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
    #   add some page info from code struct
    info = PageInfo.new
    info.country_code = code.code
    info.country_name = code.name
    info.region_name  = code.region

    Page.new( code.code, json: json, info: info )
  end

  ## Read the pages for all passed-in codes (anything responding to #each).
  ##
  ##  limit - for debugging: stop after the first x entries; nil (default) reads all
  ##
  ## returns an array of pages (in the order of the codes)
  def read_pages( codes, limit: nil )
    pages = []
    codes.each do |code|
      ## note: the page count doubles as the counter
      ##   (the old separate counter was never incremented, so limit had no effect)
      break if limit && pages.size >= limit

      pages << read_page( code )
    end
    pages
  end

  private

  ## Turn a region name into the directory slug:
  ##   change and => n
  ##   change &   => n
  ##   change all spaces to => -
  ##  e.g. East & Southeast Asia         => east-n-southeast-asia
  ##       Central America and Caribbean => central-america-n-caribbean
  ##
  ## note: 'and' is replaced wherever it occurs (also inside a longer word)
  def region_to_slug( text )
    text.downcase.gsub( 'and', 'n' ).gsub( '&', 'n' ).gsub( ' ', '-' )
  end
end ## JsonPageReader
|
50
|
+
|
51
|
+
end # module Factbook
|
@@ -0,0 +1,307 @@
|
|
1
|
+
|
2
|
+
module Factbook
|
3
|
+
|
4
|
+
class Sanitizer
|
5
|
+
include LogUtils::Logging
|
6
|
+
include Utils ## e.g. find_page_info etc.
|
7
|
+
|
8
|
+
##
## Split a raw factbook html page into its cleaned-up profile and its meta data.
##
## todo: add option for (html source) encoding - why?? why not??
##
## returns an array with three entries:
##   1) html profile without headers, footers, scripts, etc.
##   2) page (meta) info e.g. country_name, country_code, last_updated, etc.
##   3) errors - always an empty array for now
##        (todo/check: remove 3rd entry / old errors array - why? why not?)
def sanitize( html )
  info = PageInfo.new

  ## todo:
  ##  make page info optional? why? why not?
  ##  not always available (if page structure changes) - check
  ##  what page info is required??
  meta = find_page_info( html )
  if meta
    ## copy over all known entries one-by-one (same order as before)
    %i[country_code country_name country_affiliation region_code region_name].each do |key|
      info.public_send( "#{key}=", meta[key] )
    end
  else
    ## no page info found - fall back to the country code only
    info.country_code = find_country_code( html )
  end

  info.last_updated = find_page_last_updated( html )

  profile = find_country_profile( html ) ## cut-off headers, footers, scripts, etc.

  [profile, info, []]
end
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
##
## Cut the country profile out of a full factbook html page, that is,
##   everything inside <ul class="expandcollapse">, and rebuild it as a
##   flat html string:
##     <h2>section title</h2>
##     <h3>field title:</h3>
##     ...sanitized category_data div(s)...
##
## returns the rebuilt html string, or an empty string if the page has no
##   <ul class="expandcollapse"> or uses the old (h2-based) format
##
## note: exits the process (exit 1) on a section without a title or
##   on an unexpected div inside a field
def find_country_profile( html )
  ##
  ## fix known broken html bugs
  ##   in co (Colombia) page (Nov/11 2020):
  ##    <div class="photogallery_captiontext">
  ##      <p>slightly less than twice the size of Texas</p
  ##    </div>
  ## note: </p  => unclosed p!! change to </p>
  ##
  ## note: in regex use negative lookahead e.g. (?!pattern);
  ##   letters and digits are excluded too, so that longer tag names
  ##   starting with p (e.g. </pre>) do NOT get cut into </p>re>
  html = html.gsub( %r{</p(?![>a-zA-Z0-9])} ) do |m|
    puts "!! WARN: fixing unclosed </p => </p>"
    puts "#{m}"
    '</p>'
  end

  ## note: replace all non-breaking spaces with spaces for now
  ##   see fr (france) in political parties section for example
  ##   (spelled as an escape; the literal character is invisible in source)
  html = html.gsub( "\u00A0", ' ' )

  doc = Nokogiri::HTML( html )

  ul = doc.css( 'ul.expandcollapse' )[0]
  if ul.nil?
    ## nothing to cut out - report and return empty html string
    ##   (same as for old format pages below) instead of crashing on nil
    puts "!! WARN: no <ul class=\"expandcollapse\"> found - returning empty profile"
    return ''
  end

  puts ul.to_html[0..100]

  ## note: special case cc uses h2 instead of div block
  ##   <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
  ##       style="border-bottom: 2px solid white; cursor: pointer;">
  ##     Introduction :: <span class="region">CURACAO </span>
  ##   </h2>
  ##  is old format !!!!
  ##   cc - CURACAO
  ##   http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
  ##   page says - PAGE LAST UPDATED ON MARCH 14, 2018
  ##   wait for new version to be generated / pushed!!!

  ## check for old format if h2 are present
  h2s = ul.css( 'h2' )
  if h2s.size > 0
    puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
    ## return empty html string - why? why not?
    return ''
  end

  ###
  ## sanitize
  ##  remove link items
  ##  assume two <li>s are a section
  ##    (the first holds the section title, the second the fields)

  html = String.new('')

  ## filter all li's (direct children only)
  ul_children = ul.children.select { |el| el.name == 'li' }
  ## ul_children = ul.css( 'li' )

  puts " #{ul_children.size} li(s):"
  ul_children.each_slice(2) do |lis|
    li  = lis[0]
    div = li.at( 'div[sectiontitle]' )
    if div.nil?
      puts "!! ERROR: no section title found in div:"
      puts li.to_html
      exit 1
    end

    section_title = div['sectiontitle'].to_s

    html << "<h2>#{section_title}</h2>\n"

    ## note: lis[1] is nil if the li's do not come in pairs
    li = lis[1]
    ## filter all div's (direct children only)
    li_children = li.children.select { |el| el.name == 'div' }
    puts " #{li_children.size} div(s) in >#{section_title}<:"

    ## assume two <div>s are a field (the first holds the title, the second the data)
    li_children.each_slice(2) do |divs|
      div = divs[0]
      a   = div.css('a')[0]

      if a
        subsection_title = a.text ## todo/check/rename: use field_name or such - why? why not?
        html << "\n<h3>#{subsection_title}:</h3>\n"
      else
        subsection_title = '???'
        puts "!! WARN: no anchor found:"
        puts div.to_html
      end

      div = divs[1]
      div_children = div.children.select { |el| el.name == 'div' }
      puts " #{div_children.size} div(s) in field >#{subsection_title}<:"

      ## use more robust version - only get divs with category_data
      ##  div_children = div.css( 'div.category_data' )
      ##  puts " #{div_children.size} div(s) in field >#{subsection_title}< v2:"

      div_children.each do |catdiv|
        css_class = catdiv['class']

        if css_class && css_class.index( 'category_data' )
          ## skip attachments e.g. maps, pop pyramids, etc.
          next if css_class.index( 'attachment' )

          html << sanitize_data( catdiv, title: subsection_title )
          html << "\n"
        elsif catdiv.to_html.index( 'country comparison to the world' )
          ## silently skip for now country comparison
        else
          puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
          puts catdiv.to_html
          exit 1
        end
      end
    end
  end

  html
end
|
201
|
+
|
202
|
+
|
203
|
+
#
|
204
|
+
# <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
|
205
|
+
#
|
206
|
+
# remove aria labels
|
207
|
+
## matches an aria-label attribute incl. all leading whitespace e.g.
##    aria-label="Date of information: 2018"
## note: the m flag is set, that is, the value may span more than one line
##   - do NOT allow multi-line - why? why not?
ARIA_ATTR_REGEX = %r<
                     \s*
                     aria-label=
                     ('|")      # opening quote - single or double
                     .+?        # note: use non-greedy match
                     \1         # matching closing quote
                   >xim

## find double breaks e.g. <br><br>
##   (with optional whitespace incl. newlines in-between)
BR_BR_REGEX = %r< ( <br> \s* <br> ) >xim
|
214
|
+
|
215
|
+
|
216
|
+
##
## Flatten the data of a single field into (almost) plain inline html.
##
##   el     - element holding the field data; find_country_profile passes in
##              a div with a category_data class (presumably a nokogiri node)
##              note: gets changed in-place
##   title  - title of the field; used in the log messages only
##
## returns the html (string) of the element with
##   - a(nchor) links replaced by their inner text
##   - paragraphs unwrapped and <br>s replaced with the inline marker ++
##   - aria-label attributes removed
##   - fancy apostrophes changed to ascii
def sanitize_data( el, title: )
  ## todo/fix/check:
  ##  check if more than one p(aragraph)
  ##    get squeezed together without space in-between?

  ## step 0: replace all possible a(nchor) links with just inner text
  el.css( 'a' ).each do |a|
    a.replace( " #{a.text.strip} " )
  end

  inner_html = String.new('')

  ## step 1 - unwrap paragraphs if present
  ##           and convert dom/nokogiri doc/tree to html string
  p_count = 0
  el.children.each do |child|
    if child.name == 'p'
      p_inner_html = child.inner_html.strip ## note: unwrap! use inner_html NOT to_html/html
      next if p_inner_html.empty?           ## note: skip empty paragraphs for now

      inner_html << ' ++ ' if p_count > 0   ## separate from the paragraph before
      inner_html << p_inner_html
      inner_html << " \n\n "

      p_count += 1
    else
      inner_html << child.to_html
    end
  end

  ## note: keep container div!! just replace inner html!!!
  ## note: right strip all trailing spaces/newlines for now
  ##         plus add back a single one for pretty printing
  el.inner_html = inner_html.rstrip + "\n"

  # finally - convert back to html (string)
  html = el.to_html

  ## remove aria-label attributes - do not report / keep silent for now
  html = html.gsub( ARIA_ATTR_REGEX, '' )

  html = html.gsub( BR_BR_REGEX ) do |m|
    puts "in >#{title}< squish two <br>s into one:"
    puts "#{m}"
    '<br>'
  end

  html = html.gsub( /<br>/i ) do |m|
    puts "in >#{title}< replace <br> with inline (plain) text ++:"
    puts "#{m}"
    ' ++ '
  end

  ## cleanup/remove ++ before subfield e.g.
  ##   of: ++ => of:   or such
  html = html.gsub( %r{
                     (?<=([a-z]:)|(:</span>))  # note: use zero-length positive lookbehind
                     \s+
                     \+{2}
                     \s+}xim ) do |m|
    puts "in >#{title}< remove ++ before <field>: marker:"
    puts "#{m}"
    ' '
  end

  #####
  # "unfancy" smart quotes to ascii - why? why not?
  #  e.g.
  #   Following Britain’s victory => Following Britain's victory
  html = html.tr( "’", "'" )

  html
end
|
302
|
+
|
303
|
+
|
304
|
+
|
305
|
+
end # class Sanitizer
|
306
|
+
|
307
|
+
end # module Factbook
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
##
## A section of a page - a title plus a list of subsections.
class Sect
  include LogUtils::Logging

  attr_accessor :title,    ## use name instead of title - why? why not?
                :subsects

  ## starts out with no subsections
  def initialize
    @subsects = []
  end

  ## convert sects to hash
  ##   maps every subsection title to the data of the subsection;
  ##   note: the hash gets rebuilt from scratch on every call
  def data
    @data = {}
    subsects.each_with_object( @data ) do |subsect, hash|
      hash[ subsect.title ] = subsect.data
    end
  end
end # class Sect
|
28
|
+
|
29
|
+
end # module Factbook
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
##
## A subsection - a title plus the data of the subsection.
class Subsect
  include LogUtils::Logging

  attr_accessor :title,  ## use name instead of title - why? why not?
                :data    ## hash holding data e.g. { 'text' => '...' etc. }

  ## starts out with an empty data hash
  def initialize
    @data = {}
  end
end # class Subsect
|
17
|
+
|
18
|
+
end # module Factbook
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
##
|
6
|
+
## make more "generic" - why? why not?
|
7
|
+
## (re)use for other files ?? move to textutils ??
|
8
|
+
|
9
|
+
##
|
10
|
+
## for now reads in rows with values separated by three or more spaces e.g.:
|
11
|
+
## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
|
12
|
+
## 1 China 1,367,485,388
|
13
|
+
## 2 India 1,251,695,584
|
14
|
+
## 3 European Union 513,949,445
|
15
|
+
## 4 United States 321,368,864
|
16
|
+
## 5 Indonesia 255,993,674
|
17
|
+
## 6 Brazil 204,259,812
|
18
|
+
|
19
|
+
|
20
|
+
##
## Reads a plain text table - one record per line with the values
##   separated by runs of three or more spaces.
class TableReader
  include LogUtils::Logging

  ## text - the complete table as a single string
  def initialize( text )
    @text = text
  end

  ## returns an array of records; every record is an array of
  ##   the (string) values of one non-empty line
  def read
    recs = []

    @text.each_line.with_index( 1 ) do |raw_line, line_no|
      line = raw_line.strip ## remove leading and trailing whitespace

      if line.empty?
        puts "** skipping empty line #{line_no}"
        next
      end

      ## split three or more spaces - use just two ?? why? why not??
      recs << line.split( /[ ]{3,}/ )
    end

    recs
  end
end # class TableReader
|
51
|
+
|
52
|
+
end # module Factbook
|