factbook-readers 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Manifest.txt +56 -0
- data/README.md +196 -0
- data/Rakefile +34 -0
- data/data/attributes.yml +337 -0
- data/data/categories.csv +164 -0
- data/data/codes.csv +262 -0
- data/data/codesxref.csv +280 -0
- data/data/comparisons.csv +75 -0
- data/lib/factbook-readers.rb +59 -0
- data/lib/factbook-readers/attributes.rb +74 -0
- data/lib/factbook-readers/builder.rb +212 -0
- data/lib/factbook-readers/builder_item.rb +185 -0
- data/lib/factbook-readers/builder_json.rb +79 -0
- data/lib/factbook-readers/codes.rb +122 -0
- data/lib/factbook-readers/comparisons.rb +50 -0
- data/lib/factbook-readers/counter.rb +48 -0
- data/lib/factbook-readers/normalize.rb +43 -0
- data/lib/factbook-readers/page.rb +148 -0
- data/lib/factbook-readers/page_info.rb +12 -0
- data/lib/factbook-readers/reader_json.rb +51 -0
- data/lib/factbook-readers/sanitizer.rb +307 -0
- data/lib/factbook-readers/sect.rb +29 -0
- data/lib/factbook-readers/subsect.rb +18 -0
- data/lib/factbook-readers/table.rb +52 -0
- data/lib/factbook-readers/utils.rb +47 -0
- data/lib/factbook-readers/utils_info.rb +129 -0
- data/lib/factbook-readers/version.rb +24 -0
- data/lib/factbook/readers.rb +5 -0
- data/test/data/au.html +579 -0
- data/test/data/au.yml +8 -0
- data/test/data/be.html +596 -0
- data/test/data/be.yml +8 -0
- data/test/data/json/au.json +892 -0
- data/test/data/src/ag.html +716 -0
- data/test/data/src/au-2015-09-24.html +2006 -0
- data/test/data/src/au.html +658 -0
- data/test/data/src/be-2015-09-24.html +2011 -0
- data/test/data/src/be.html +648 -0
- data/test/helper.rb +11 -0
- data/test/test_attribs.rb +87 -0
- data/test/test_attribs_def.rb +20 -0
- data/test/test_builder.rb +35 -0
- data/test/test_codes.rb +76 -0
- data/test/test_comparisons.rb +19 -0
- data/test/test_convert.rb +30 -0
- data/test/test_counter.rb +31 -0
- data/test/test_fields.rb +52 -0
- data/test/test_importer.rb +56 -0
- data/test/test_item_builder.rb +99 -0
- data/test/test_json.rb +45 -0
- data/test/test_json_builder.rb +25 -0
- data/test/test_normalize.rb +23 -0
- data/test/test_page.rb +38 -0
- data/test/test_sanitizer.rb +39 -0
- data/test/test_sanitizer_regex.rb +89 -0
- metadata +196 -0
@@ -0,0 +1,51 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
##
## Reads country profile pages that are stored as json files on disk.
##
##  Files are looked up as
##    <json_dir>/<region-slug>/<country-code>.json
##  where the region slug is derived from the region name
##  (see region_to_slug).
class JsonPageReader
  ## json_dir - root directory holding one sub directory per region
  def initialize( json_dir )
    @json_dir = json_dir
  end

  ## Reads the json file for one country and wraps it into a Page.
  ##
  ##  code - country code record; the methods code, name and region
  ##         get called on it
  ##
  ##  Returns the Page built from the raw json text.
  def read_page( code )
    path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"

    puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
    json = File.read( path )

    ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
    ##   add some page info from code struct
    info = PageInfo.new
    info.country_code = code.code
    info.country_name = code.name
    info.region_name  = code.region

    Page.new( code.code, json: json, info: info )
  end

  ## Reads the pages for all passed in country codes (in order).
  ##
  ##  codes - collection of country code records; only each gets called on it
  ##  limit - optional max. number of pages to read
  ##          (for debugging - just process the first x entries);
  ##          nil (default) reads all
  ##
  ##  Returns an array of pages.
  def read_pages( codes, limit: nil )
    pages = []
    codes.each do |code|
      ## note: stop (NOT skip) once the limit is reached;
      ##   no need to walk the remaining codes
      break if limit && pages.size >= limit

      pages << read_page( code )
    end
    pages
  end

private
  ## Converts a region name into the directory name (slug) used on disk:
  ##   change and => n
  ##   change &   => n
  ##   change all spaces to => -
  ##
  ##  e.g. East & Southeast Asia         => east-n-southeast-asia
  ##       Central America and Caribbean => central-america-n-caribbean
  def region_to_slug( text )
    ## note: match "and" as a whole word only (\b) -
    ##   otherwise names such as "Highlands" would get mangled too
    text.downcase
        .gsub( /\band\b/, 'n' )
        .gsub( '&', 'n' )
        .gsub( ' ', '-' )
  end
end ## JsonPageReader
|
50
|
+
|
51
|
+
end # module Factbook
|
@@ -0,0 +1,307 @@
|
|
1
|
+
|
2
|
+
module Factbook
|
3
|
+
|
4
|
+
##
## Cuts a downloaded country page (html) down to the country profile
##  and rewrites it into a simple, flat html snippet:
##    <h2>   for section titles,
##    <h3>   for field (subsection) titles, and
##    the (cleaned-up) category_data divs for the field data.
class Sanitizer
  include LogUtils::Logging
  include Utils    ## e.g. find_page_info etc.

  ## Sanitizes a country page.
  ##
  ##  html - page source (string)
  ##
  ##  Returns an array with three entries:
  ##    1) html profile without headers, footers, scripts, etc.
  ##    2) page (meta) info e.g. country_name, country_code, last_updated, etc.
  ##    3) errors - for now always an empty array
  ##
  ##  todo: add option for (html source) encoding - why?? why not??
  def sanitize( html )
    page_info = PageInfo.new

    ## todo:
    ##   make page info optional? why? why not?
    ##   not always available (if page structure changes) - check
    ##   what page info is required??
    h = find_page_info( html )
    if h
      page_info.country_code        = h[:country_code]
      page_info.country_name        = h[:country_name]
      page_info.country_affiliation = h[:country_affiliation]
      page_info.region_code         = h[:region_code]
      page_info.region_name         = h[:region_name]
    else
      ## no page info found - fall back to the country code only
      page_info.country_code = find_country_code( html )
    end

    page_info.last_updated = find_page_last_updated( html )

    html_profile = find_country_profile( html )    ## cut-off headers, footers, scripts, etc.

    ## todo/check: remove 3rd args old errors array - why? why not?
    [html_profile, page_info, []]
  end


  ## Extracts the country profile, that is, everything inside of
  ##    <ul class="expandcollapse">
  ##  and converts it into flat html (h2 / h3 / data divs).
  ##
  ##  html - page source (string)
  ##
  ##  Returns the profile as an html string, or an empty string
  ##  if the page has no profile list or uses the old (h2-based) format.
  ##
  ##  note: calls exit 1 if the page structure is not as expected
  ##        (missing section title or unknown div in a field).
  def find_country_profile( html )
    ##
    ## fix known broken html bugs
    ##   in co (Columbia) page (Nov/11 2020):
    ##     <div class="photogallery_captiontext">
    ##       <p>slightly less than twice the size of Texas</p
    ##     </div>
    ##   note: </p => unclosed p!! change to </p>
    ##
    ## note: the negative lookahead (?!pattern) skips
    ##   - properly closed tags e.g. </p> or </p >, and
    ##   - other tags starting with p e.g. </pre>
    html = html.gsub( %r{</p(?![a-zA-Z0-9]|\s*>)} ) do |m|
      puts "!! WARN: fixing unclosed </p => </p>"
      puts m
      '</p>'
    end

    ## note: replace all non-breaking spaces with spaces for now
    ##   see fr (france) in political parties section for example
    ##   (uses an explicit escape - a literal non-breaking space
    ##    looks exactly like a plain space in the source)
    html = html.gsub( "\u00A0", ' ' )

    doc = Nokogiri::HTML( html )

    ul = doc.css( 'ul.expandcollapse' ).first
    if ul.nil?
      ## nothing to sanitize - same handling as for the old format below
      puts "  !! WARN: no ul.expandcollapse found - sorry - cannot extract country profile!!!"
      return ''
    end

    puts ul.to_html[0..100]

    ## note: special case cc uses h2 instead of div block
    ##   <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
    ##       style="border-bottom: 2px solid white; cursor: pointer;">
    ##     Introduction :: <span class="region">CURACAO </span>
    ##   </h2>
    ## is old format !!!!
    ##   cc - CURACAO
    ##     http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
    ##     page says         - PAGE LAST UPDATED ON MARCH 14, 2018
    ##   wait for new version to be generated / pushed!!!

    ## check for old format if h2 are present
    h2s = ul.css( 'h2' )
    if h2s.size > 0
      puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
      ## return empty html string - why? why not?
      return ''
    end

    ###
    ## sanitize
    ##   remove link items
    ##   assume two <li>s are a section
    ##     (1st li holds the section title, 2nd li holds the fields)

    html = String.new('')

    ## filter all li's (skip text nodes etc.)
    ul_children = ul.children.select { |el| el.name == 'li' }

    puts " #{ul_children.size} li(s):"
    ul_children.each_slice(2) do |lis|
      li  = lis[0]
      div = li.at( 'div[sectiontitle]' )
      if div.nil?
        puts "!! ERROR: no section title found in div:"
        puts li.to_html
        exit 1
      end

      section_title = div['sectiontitle'].to_s

      html << "<h2>#{section_title}</h2>\n"

      li = lis[1]
      ## filter all div's (skip text nodes etc.)
      li_children = li.children.select { |el| el.name == 'div' }
      puts " #{li_children.size} div(s) in >#{section_title}<:"

      ## assume two <div>s are a field
      ##   (1st div holds the field title, 2nd div holds the field data)
      li_children.each_slice(2) do |divs|
        div = divs[0]
        a   = div.css( 'a' ).first

        if a
          subsection_title = a.text   ## todo/check/rename: use field_name or such - why? why not?
          html << "\n<h3>#{subsection_title}:</h3>\n"
        else
          subsection_title = '???'
          puts "!! WARN: no anchor found:"
          puts div.to_html
        end

        div = divs[1]
        div_children = div.children.select { |el| el.name == 'div' }
        puts " #{div_children.size} div(s) in field >#{subsection_title}<:"

        div_children.each do |catdiv|
          css_class = catdiv['class']

          if css_class && css_class.index( 'category_data' )
            ## skip attachments e.g. maps, pop pyramids, etc.
            unless css_class.index( 'attachment' )
              html << sanitize_data( catdiv, title: subsection_title )
              html << "\n"
            end
          elsif catdiv.to_html.index( 'country comparison to the world' )
            ## silently skip for now country comparison
          else
            puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
            puts catdiv.to_html
            exit 1
          end
        end
      end
    end

    html
  end


  #
  #  <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
  #
  #   remove aria labels
  #
  #  NOTE(review): the m flag lets .+? run across newlines -
  #    the "do NOT allow multi-line" remark below reads like an open question; confirm
  ARIA_ATTR_REGEX = /\s*
                      aria-label=('|").+?\1     ## note: use non-greedy match e.g. .+?
                    /xim    ## do NOT allow multi-line - why? why not?

  ## find double breaks e.g. <br><br>
  BR_BR_REGEX = /(<br> \s* <br>)
                /xim    ## do NOT allow multi-line - why? why not?


  ## Cleans up a single data div of a field.
  ##
  ##  el    - nokogiri node of the data div; note: gets changed in-place
  ##  title - field (subsection) title; used for log messages only
  ##
  ##  Returns the div as an html string with
  ##    - links replaced by their text,
  ##    - paragraphs unwrapped (and joined with ++),
  ##    - aria-label attributes removed,
  ##    - line breaks (<br>) replaced with ++, and
  ##    - "fancy" apostrophes replaced with plain ascii ones.
  ##
  ##  todo/fix/check:
  ##    check if more than one p(aragraph)
  ##     get squeezed together without space inbetween?
  def sanitize_data( el, title: )
    ## step 0: replace all possible a(nchor) links with just inner text
    el.css( 'a' ).each do |a|
      a.replace( " #{a.text.strip} " )
    end

    inner_html = String.new('')

    ## step 1 - unwrap paragraphs if present
    ##   and convert dom/nokogiri doc/tree to html string
    p_count = 0
    el.children.each do |child|
      if child.name == 'p'
        p_inner_html = child.inner_html.strip   ## note: unwrap! use inner_html NOT to_html/html

        ## note: skip empty paragraphs for now
        unless p_inner_html.empty?
          inner_html << ' ++ '  if p_count > 0
          inner_html << p_inner_html
          inner_html << " \n\n "

          p_count += 1
        end
      else
        inner_html << child.to_html
      end
    end

    ## note: keep container div!! just replace inner html!!!
    ## note: right strip all trailing spaces/newlines for now
    ##         plus add back a single one for pretty printing
    el.inner_html = inner_html.rstrip + "\n"

    # finally - convert back to html (string)
    html = el.to_html

    ## remove aria-label attributes - do not report / keep silent for now
    html = html.gsub( ARIA_ATTR_REGEX, '' )

    html = html.gsub( BR_BR_REGEX ) do |m|
      puts "in >#{title}< squish two <br>s into one:"
      puts m
      '<br>'
    end

    html = html.gsub( /<br>/i ) do |m|
      puts "in >#{title}< replace <br> with inline (plain) text ++:"
      puts m
      ' ++ '
    end

    ## cleanup/remove ++ before subfield e.g.
    ##   of: ++ => of:   or such
    html = html.gsub( %r{
               (?<=([a-z]:)|(:</span>))   # note: use zero-length positive lookbehind
                \s+
                \+{2}
                \s+}xim ) do |m|
      puts "in >#{title}< remove ++ before <field>: marker:"
      puts m
      ' '
    end

    #####
    #  "unfancy" smart quotes to ascii - why? why not?
    #   e.g.
    #   Following Britain’s victory => Following Britain's victory
    html = html.tr( "’", "'" )

    html
  end

end # class Sanitizer
|
306
|
+
|
307
|
+
end # module Factbook
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
## A section of a country profile; holds an (ordered) list of subsections.
class Sect
  include LogUtils::Logging

  attr_accessor :title,     ## use name instead of title - why? why not?
                :subsects

  def initialize
    @subsects = []
  end

  ## Converts the subsections into a hash (subsection title => subsection data).
  ##
  ##  note: the hash gets rebuilt (and stored in @data) on every call;
  ##        if two subsections share a title the later one wins.
  def data
    @data = {}
    subsects.each_with_object( @data ) do |subsect, hash|
      hash[ subsect.title ] = subsect.data
    end
  end

end # class Sect
|
28
|
+
|
29
|
+
end # module Factbook
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
## A subsection (field) of a country profile section.
class Subsect
  include LogUtils::Logging

  ## title - use name instead of title - why? why not?
  ## data  - hash holding data e.g. { 'text' => '...' etc. }
  attr_accessor :title, :data

  ## Starts out with an empty data hash (and no title).
  def initialize
    @data = {}
  end

end # class Subsect
|
17
|
+
|
18
|
+
end # module Factbook
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
##
|
6
|
+
## make more "generic" - why? why not?
|
7
|
+
## (re)use for other files ?? move to textutils ??
|
8
|
+
|
9
|
+
##
|
10
|
+
## for now reads in rows with values separated by at least 3+ spaces e.g.:
|
11
|
+
## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
|
12
|
+
## 1 China 1,367,485,388
|
13
|
+
## 2 India 1,251,695,584
|
14
|
+
## 3 European Union 513,949,445
|
15
|
+
## 4 United States 321,368,864
|
16
|
+
## 5 Indonesia 255,993,674
|
17
|
+
## 6 Brazil 204,259,812
|
18
|
+
|
19
|
+
|
20
|
+
## Reads a plain text table; one record per line with the values
##  separated by three or more spaces.
class TableReader
  include LogUtils::Logging

  ## text - complete table as a (multi-line) string
  def initialize( text )
    @text = text
  end

  ## Splits the text into records.
  ##
  ##  Returns an array of records; every record is an array of strings
  ##  (the values of the line). Blank lines get reported and skipped.
  def read
    recs = []

    @text.each_line.with_index( 1 ) do |raw_line, line_no|
      line = raw_line.strip    ## remove leading and trailing whitespace

      if line.empty?
        puts "** skipping empty line #{line_no}"
        next
      end

      ## split three or more spaces - use just two ?? why? why not??
      recs << line.split( /[ ]{3,}/ )
    end

    recs
  end

end # class TableReader
|
51
|
+
|
52
|
+
end # module Factbook
|