factbook-readers 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Manifest.txt +56 -0
  4. data/README.md +196 -0
  5. data/Rakefile +34 -0
  6. data/data/attributes.yml +337 -0
  7. data/data/categories.csv +164 -0
  8. data/data/codes.csv +262 -0
  9. data/data/codesxref.csv +280 -0
  10. data/data/comparisons.csv +75 -0
  11. data/lib/factbook-readers.rb +59 -0
  12. data/lib/factbook-readers/attributes.rb +74 -0
  13. data/lib/factbook-readers/builder.rb +212 -0
  14. data/lib/factbook-readers/builder_item.rb +185 -0
  15. data/lib/factbook-readers/builder_json.rb +79 -0
  16. data/lib/factbook-readers/codes.rb +122 -0
  17. data/lib/factbook-readers/comparisons.rb +50 -0
  18. data/lib/factbook-readers/counter.rb +48 -0
  19. data/lib/factbook-readers/normalize.rb +43 -0
  20. data/lib/factbook-readers/page.rb +148 -0
  21. data/lib/factbook-readers/page_info.rb +12 -0
  22. data/lib/factbook-readers/reader_json.rb +51 -0
  23. data/lib/factbook-readers/sanitizer.rb +307 -0
  24. data/lib/factbook-readers/sect.rb +29 -0
  25. data/lib/factbook-readers/subsect.rb +18 -0
  26. data/lib/factbook-readers/table.rb +52 -0
  27. data/lib/factbook-readers/utils.rb +47 -0
  28. data/lib/factbook-readers/utils_info.rb +129 -0
  29. data/lib/factbook-readers/version.rb +24 -0
  30. data/lib/factbook/readers.rb +5 -0
  31. data/test/data/au.html +579 -0
  32. data/test/data/au.yml +8 -0
  33. data/test/data/be.html +596 -0
  34. data/test/data/be.yml +8 -0
  35. data/test/data/json/au.json +892 -0
  36. data/test/data/src/ag.html +716 -0
  37. data/test/data/src/au-2015-09-24.html +2006 -0
  38. data/test/data/src/au.html +658 -0
  39. data/test/data/src/be-2015-09-24.html +2011 -0
  40. data/test/data/src/be.html +648 -0
  41. data/test/helper.rb +11 -0
  42. data/test/test_attribs.rb +87 -0
  43. data/test/test_attribs_def.rb +20 -0
  44. data/test/test_builder.rb +35 -0
  45. data/test/test_codes.rb +76 -0
  46. data/test/test_comparisons.rb +19 -0
  47. data/test/test_convert.rb +30 -0
  48. data/test/test_counter.rb +31 -0
  49. data/test/test_fields.rb +52 -0
  50. data/test/test_importer.rb +56 -0
  51. data/test/test_item_builder.rb +99 -0
  52. data/test/test_json.rb +45 -0
  53. data/test/test_json_builder.rb +25 -0
  54. data/test/test_normalize.rb +23 -0
  55. data/test/test_page.rb +38 -0
  56. data/test/test_sanitizer.rb +39 -0
  57. data/test/test_sanitizer_regex.rb +89 -0
  58. metadata +196 -0
@@ -0,0 +1,12 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
## Page (meta) info record for a single country page.
##   All members are plain read/write attributes and default to nil;
##   member order matters for positional construction.
PageInfo = Struct.new( *%i[
  country_code
  country_name
  country_affiliation
  region_code
  region_name
  last_updated
] )
11
+
12
+ end # module Factbook
@@ -0,0 +1,51 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+
6
## Reads country pages from json files stored as
##   <json_dir>/<region-slug>/<country-code>.json
class JsonPageReader
  ## json_dir - root directory; used as the path prefix for every page read
  def initialize( json_dir )
    @json_dir = json_dir
  end

  ## Reads the json file for one country and wraps it in a Page.
  ##
  ##  code - record responding to code, name and region
  ##
  ##  returns the Page built from the raw json text plus a minimal PageInfo
  ##  note: errors raised by File.read (e.g. missing file) are not rescued here
  def read_page( code )
    path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"

    puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
    json = File.read( path )

    ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
    ##   add some page info from code struct
    info = PageInfo.new
    info.country_code = code.code
    info.country_name = code.name
    info.region_name  = code.region

    Page.new( code.code, json: json, info: info )
  end

  ## Reads the pages for all codes (in iteration order).
  ##
  ##  codes - anything responding to each and yielding code records
  ##  limit - optional max. number of pages to read (for debugging);
  ##            nil (default) reads everything
  ##
  ##  returns an array of pages
  def read_pages( codes, limit: nil )
    pages = []
    codes.each do |code|
      ## fix: the old counter was never incremented, so limit had no effect;
      ##   stop as soon as limit pages have been collected
      break if limit && pages.size >= limit

      pages << read_page( code )
    end
    pages
  end

  private

  ## Turns a region name into its directory slug
  ##   change and => n
  ##   change &   => n
  ##   change all spaces to => -
  ##  e.g. East & Southeast Asia          => east-n-southeast-asia
  ##       Central America and Caribbean  => central-america-n-caribbean
  ##
  ## NOTE(review): 'and' is replaced anywhere, also inside a word
  ##   (e.g. 'iceland' => 'iceln'); kept as-is so slugs stay unchanged
  def region_to_slug( text )
    text.downcase.gsub('and', 'n').gsub( '&', 'n' ).gsub( ' ', '-' )
  end
end ## JsonPageReader
50
+
51
+ end # module Factbook
@@ -0,0 +1,307 @@
1
+
2
+ module Factbook
3
+
4
+ class Sanitizer
5
+ include LogUtils::Logging
6
+ include Utils ## e.g. find_page_info etc.
7
+
8
## Sanitizes a full country page.
##
##  html - page source (string)
##
##  returns a three-element array:
##    1) html profile without headers, footers, scripts, etc.
##          (as returned by find_country_profile)
##    2) page (meta) info e.g. country_name, country_code, last_updated, etc.
##    3) errors - kept for compatibility; always an empty array for now
##
##  todo: add option for (html source) encoding - why?? why not??
##  todo: make page info optional? why? why not?
##          not always available (if page structure changes) - check
##          what page info is required??
def sanitize( html )
  info = PageInfo.new

  meta = find_page_info( html )
  if meta
    %i[country_code
       country_name
       country_affiliation
       region_code
       region_name].each { |key| info[key] = meta[key] }
  else
    ## no page info found - fall back to the country code only
    info.country_code = find_country_code( html )
  end

  info.last_updated = find_page_last_updated( html )

  profile = find_country_profile( html )   ## cut-off headers, footers, scripts, etc.

  ## todo/check: remove 3rd args old errors array - why? why not?
  [profile, info, []]
end
43
+
44
+
45
+
46
## Cuts the country profile out of a full page and rebuilds it as flat html:
##   one <h2> per section, one <h3> per field, followed by the sanitized
##   category_data divs of that field (see sanitize_data).
##
##  html - page source (string)
##
##  returns the rebuilt html (string) or an empty string if the page
##    still uses the old (h2-based) format
##
##  note: prints progress / warnings to stdout and exits the process
##    (exit 1) if the page does not have the expected structure
def find_country_profile( html )
  ##
  ## fix known broken html bugs
  ##   in co (Columbia) page (Nov/11 2020):
  ##     <div class="photogallery_captiontext">
  ##       <p>slightly less than twice the size of Texas</p
  ##     </div>
  ##   note: </p => unclosed p!! change to </p>
  ##
  ## note: uses a negative lookahead e.g. (?!pattern)
  ##   - \w     keeps other closing tags starting with p (e.g. </pre>) untouched
  ##   - \s*>   keeps a properly closed </p> (or </p >) untouched
  html = html.gsub( %r{</p(?!\w|\s*>)} ) do |m|
    puts "!! WARN: fixing unclosed </p => </p>"
    puts "#{m}"
    '</p>'
  end

  ## note: replace all non-breaking spaces with spaces for now
  ##   see fr (france) in political parties section for example
  html = html.gsub( "&nbsp;", ' ' )

  doc = Nokogiri::HTML( html )

  ## remove header (everything before) <ul class="expandcollapse">
  ul = doc.css( 'ul.expandcollapse' )[0]
  if ul.nil?
    ## same handling as the other structure errors below
    ##   (instead of failing with a NoMethodError on nil)
    puts "!! ERROR: no <ul class=\"expandcollapse\"> found in page"
    exit 1
  end

  puts ul.to_html[0..100]

  ## note: special case cc uses h2 instead of div block
  ##   <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
  ##       style="border-bottom: 2px solid white; cursor: pointer;">
  ##     Introduction :: <span class="region">CURACAO </span>
  ##   </h2>
  ##  is old format !!!!
  ##    cc - CURACAO
  ##    http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
  ##    page says         - PAGE LAST UPDATED ON MARCH 14, 2018
  ##  wait for new version to be generated / pushed!!!

  ## check for old format if h2 are present
  h2s = ul.css( 'h2' )
  if h2s.size > 0
    puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
    ## return empty html string - why? why not?
    return ''
  end

  ###
  ## sanitize
  ##   remove link items
  ##   assume two <li>s are a section  (1st: title, 2nd: fields)

  out = String.new('')

  ## only direct <li> children count (skips text nodes etc.)
  section_lis = ul.children.select { |el| el.name == 'li' }

  puts " #{section_lis.size} li(s):"
  section_lis.each_slice( 2 ) do |title_li, fields_li|
    title_div = title_li.at( 'div[sectiontitle]' )
    if title_div.nil?
      puts "!! ERROR: no section title found in div:"
      puts title_li.to_html
      exit 1
    end

    section_title = title_div['sectiontitle'].to_s

    out << "<h2>#{section_title}</h2>\n"

    if fields_li.nil?
      ## odd number of <li>s - title without its fields
      puts "!! ERROR: no <li> with fields found for section >#{section_title}<:"
      puts title_li.to_html
      exit 1
    end

    ## only direct <div> children count; assume two <div>s are a field
    ##   (1st: name / anchor, 2nd: data)
    field_divs = fields_li.children.select { |el| el.name == 'div' }
    puts " #{field_divs.size} div(s) in >#{section_title}<:"

    field_divs.each_slice( 2 ) do |name_div, data_div|
      a = name_div.css( 'a' )[0]

      if a
        subsection_title = a.text   ## todo/check/rename: use field_name or such - why? why not?
        out << "\n<h3>#{subsection_title}:</h3>\n"
      else
        subsection_title = '???'
        puts "!! WARN: no anchor found:"
        puts name_div.to_html
      end

      if data_div.nil?
        ## odd number of <div>s - field name without its data
        puts "!! ERROR: no data div found for field >#{subsection_title}<:"
        puts name_div.to_html
        exit 1
      end

      data_children = data_div.children.select { |el| el.name == 'div' }
      puts " #{data_children.size} div(s) in field >#{subsection_title}<:"

      data_children.each do |catdiv|
        css_class = catdiv['class']

        if css_class && css_class.index( 'category_data' )
          ## skip attachments e.g. maps, pop pyramids, etc.
          next if css_class.index( 'attachment' )

          out << sanitize_data( catdiv, title: subsection_title )
          out << "\n"
        elsif catdiv.to_html.index( 'country comparison to the world' )
          ## silently skip for now country comparison
        else
          puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
          puts catdiv.to_html
          exit 1
        end
      end
    end
  end

  out
end
201
+
202
+
203
+ #
204
+ # <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
205
+ #
206
+ # remove aria labels
207
## matches an aria-label attribute incl. leading whitespace
##   note: uses a non-greedy match e.g. .+? up to the matching closing quote
##   note: the m flag lets . match newlines too
##           i.e. a label value may span lines
ARIA_ATTR_REGEX = /\s* aria-label=('|") .+? \1/xim

## find double breaks e.g. <br><br>  (with optional whitespace inbetween)
BR_BR_REGEX = /(<br> \s* <br>)/xim


## Sanitizes a single category_data element and returns it as html (string).
##
##  el    - element (node) to sanitize; note: gets changed in-place
##            (anchors replaced by text, inner html replaced)
##  title - field name; only used in the log messages
##
##  steps:
##    0) replace all a(nchor) links with just the inner text
##    1) unwrap paragraphs; two or more paragraphs get joined with ++
##    2) remove aria-label attributes
##    3) squish <br><br> into one and replace <br> with inline ++
##    4) remove ++ following a "<field>:" marker
##    5) "unfancy" smart quotes to ascii
##
##  todo/fix/check:
##    check if more than one p(aragraph)
##    get squeezed together without space inbetween?
def sanitize_data( el, title: )
  ## step 0: replace all possible a(nchor) links with just inner text
  el.css( 'a' ).each { |a| a.replace( " #{a.text.strip} " ) }

  ## step 1 - unwrap paragraphs if present
  ##   and convert dom/nokogiri doc/tree to html string
  inner_html = String.new('')
  p_count    = 0
  el.children.each do |child|
    if child.name == 'p'
      p_inner_html = child.inner_html.strip   ## note: unwrap! use inner_html NOT to_html/html
      next if p_inner_html.empty?             ## note: skip empty paragraphs for now

      inner_html << ' ++ '  if p_count > 0
      inner_html << p_inner_html
      inner_html << " \n\n "

      p_count += 1
    else
      inner_html << child.to_html
    end
  end

  ## note: keep container div!! just replace inner html!!!
  ## note: right strip all trailing spaces/newlines for now
  ##   plus add back a single one for pretty printing
  el.inner_html = inner_html.rstrip + "\n"

  # finally - convert back to html (string)
  html = el.to_html

  ## do not report / keep silent for now
  html = html.gsub( ARIA_ATTR_REGEX, '' )

  html = html.gsub( BR_BR_REGEX ) do |m|
    puts "in >#{title}< squish two <br>s into one:"
    puts "#{m}"
    '<br>'
  end

  html = html.gsub( /<br>/i ) do |m|
    puts "in >#{title}< replace <br> with inline (plain) text ++:"
    puts "#{m}"
    ' ++ '
  end

  ## cleanup/remove ++ before subfield e.g.
  ##   of: ++  => of:  or such
  html = html.gsub( %r{
                      (?<=([a-z]:)|(:</span>))   # note: use zero-length positive lookbehind
                      \s+
                      \+{2}
                      \s+}xim ) do |m|
    ## fix: closing < after title was missing in this message
    puts "in >#{title}< remove ++ before <field>: marker:"
    puts "#{m}"
    ' '
  end

  #####
  # "unfancy" smart quotes to ascii - why? why not?
  #   e.g.
  #  Following Britain’s victory => Following Britain's victory
  html.tr( "’", "'" )
end
302
+
303
+
304
+
305
+ end # class Sanitizer
306
+
307
+ end # module Factbook
@@ -0,0 +1,29 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+
6
## A section: a title plus an ordered list of subsections.
class Sect
  include LogUtils::Logging

  ## title    - section title; use name instead of title - why? why not?
  ## subsects - array of subsections; starts out empty
  attr_accessor :title, :subsects

  def initialize
    @subsects = []
  end

  ## Converts the subsections to a hash keyed by subsection title.
  ##   note: the hash gets rebuilt (and stored in @data) on every call;
  ##           a later subsection with the same title overwrites an earlier one
  def data
    @data = {}
    subsects.each { |subsect| @data[ subsect.title ] = subsect.data }
    @data
  end
end # class Sect
28
+
29
+ end # module Factbook
@@ -0,0 +1,18 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+
6
## A subsection: a title plus a data hash.
class Subsect
  include LogUtils::Logging

  ## title - subsection title; use name instead of title - why? why not?
  ## data  - hash holding data e.g. { 'text' => '...' etc. }; starts out empty
  attr_accessor :title, :data

  def initialize
    @data = {}
  end
end # class Subsect
17
+
18
+ end # module Factbook
@@ -0,0 +1,52 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ ##
6
+ ## make more "generic" - why? why not?
7
+ ## (re)use for other files ?? move to textutils ??
8
+
9
+ ##
10
+ ## for now reads in rows with values separated by at least 3+ spaces e.g.:
11
+ ## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
12
+ ## 1 China 1,367,485,388
13
+ ## 2 India 1,251,695,584
14
+ ## 3 European Union 513,949,445
15
+ ## 4 United States 321,368,864
16
+ ## 5 Indonesia 255,993,674
17
+ ## 6 Brazil 204,259,812
18
+
19
+
20
## Reads a plain text table with values separated by three or more spaces.
class TableReader
  include LogUtils::Logging

  ## text - table source; read line by line via each_line
  def initialize( text )
    @text = text
  end

  ## Splits every non-empty line into its values.
  ##
  ##  returns an array of records (one array of strings per line);
  ##    empty lines get skipped (and reported on stdout)
  def read
    rows = []

    @text.each_line.with_index( 1 ) do |raw_line, line_no|
      line = raw_line.strip   ## remove leading and trailing whitespace

      if line.empty?
        puts "** skipping empty line #{line_no}"
      else
        ## split three or more spaces - use just two ?? why? why not??
        rows << line.split( /[ ]{3,}/ )
      end
    end

    rows
  end
end # class TableReader
51
+
52
+ end # module Factbook