factbook-readers 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Manifest.txt +56 -0
  4. data/README.md +196 -0
  5. data/Rakefile +34 -0
  6. data/data/attributes.yml +337 -0
  7. data/data/categories.csv +164 -0
  8. data/data/codes.csv +262 -0
  9. data/data/codesxref.csv +280 -0
  10. data/data/comparisons.csv +75 -0
  11. data/lib/factbook-readers.rb +59 -0
  12. data/lib/factbook-readers/attributes.rb +74 -0
  13. data/lib/factbook-readers/builder.rb +212 -0
  14. data/lib/factbook-readers/builder_item.rb +185 -0
  15. data/lib/factbook-readers/builder_json.rb +79 -0
  16. data/lib/factbook-readers/codes.rb +122 -0
  17. data/lib/factbook-readers/comparisons.rb +50 -0
  18. data/lib/factbook-readers/counter.rb +48 -0
  19. data/lib/factbook-readers/normalize.rb +43 -0
  20. data/lib/factbook-readers/page.rb +148 -0
  21. data/lib/factbook-readers/page_info.rb +12 -0
  22. data/lib/factbook-readers/reader_json.rb +51 -0
  23. data/lib/factbook-readers/sanitizer.rb +307 -0
  24. data/lib/factbook-readers/sect.rb +29 -0
  25. data/lib/factbook-readers/subsect.rb +18 -0
  26. data/lib/factbook-readers/table.rb +52 -0
  27. data/lib/factbook-readers/utils.rb +47 -0
  28. data/lib/factbook-readers/utils_info.rb +129 -0
  29. data/lib/factbook-readers/version.rb +24 -0
  30. data/lib/factbook/readers.rb +5 -0
  31. data/test/data/au.html +579 -0
  32. data/test/data/au.yml +8 -0
  33. data/test/data/be.html +596 -0
  34. data/test/data/be.yml +8 -0
  35. data/test/data/json/au.json +892 -0
  36. data/test/data/src/ag.html +716 -0
  37. data/test/data/src/au-2015-09-24.html +2006 -0
  38. data/test/data/src/au.html +658 -0
  39. data/test/data/src/be-2015-09-24.html +2011 -0
  40. data/test/data/src/be.html +648 -0
  41. data/test/helper.rb +11 -0
  42. data/test/test_attribs.rb +87 -0
  43. data/test/test_attribs_def.rb +20 -0
  44. data/test/test_builder.rb +35 -0
  45. data/test/test_codes.rb +76 -0
  46. data/test/test_comparisons.rb +19 -0
  47. data/test/test_convert.rb +30 -0
  48. data/test/test_counter.rb +31 -0
  49. data/test/test_fields.rb +52 -0
  50. data/test/test_importer.rb +56 -0
  51. data/test/test_item_builder.rb +99 -0
  52. data/test/test_json.rb +45 -0
  53. data/test/test_json_builder.rb +25 -0
  54. data/test/test_normalize.rb +23 -0
  55. data/test/test_page.rb +38 -0
  56. data/test/test_sanitizer.rb +39 -0
  57. data/test/test_sanitizer_regex.rb +89 -0
  58. metadata +196 -0
@@ -0,0 +1,12 @@
# encoding: utf-8

module Factbook

  ##
  ## Plain value holder for the (meta) info of a single country page.
  ##
  ## All members are optional and default to nil;
  ##   readers fill in whatever they can find, e.g.
  ##
  ##     info = PageInfo.new
  ##     info.country_code = 'au'
  ##     info.country_name = 'Austria'
  ##
  PageInfo = Struct.new( :country_code,
                         :country_name,
                         :country_affiliation,
                         :region_code,
                         :region_name,
                         :last_updated )

end # module Factbook
@@ -0,0 +1,51 @@
# encoding: utf-8

module Factbook

  ##
  ## Reads country pages from a directory of json files laid out as
  ##
  ##   <json_dir>/<region-slug>/<country-code>.json
  ##
  ## e.g. <json_dir>/east-n-southeast-asia/th.json
  class JsonPageReader

    ## json_dir - root directory holding one sub directory per region
    def initialize( json_dir )
      @json_dir = json_dir
    end

    ##
    ## Read the json file for a single country and wrap it in a Page.
    ##
    ## code - any object responding to code, name and region
    ##
    ## Raises whatever File.read raises if the file cannot be read
    ##   (e.g. Errno::ENOENT if it is missing).
    def read_page( code )
      path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"

      puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
      json = File.read( path )

      ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
      ##   add some page info from code struct
      info = PageInfo.new
      info.country_code = code.code
      info.country_name = code.name
      info.region_name  = code.region

      Page.new( code.code, json: json, info: info )
    end

    ##
    ## Read the pages for all codes (in order) and return them as an array.
    ##
    ## limit - for debugging: read at most the first limit entries;
    ##           nil (the default) reads everything
    def read_pages( codes, limit: nil )
      pages = []
      codes.each do |code|
        ## note: stop once limit pages are read; the page count is the
        ##   counter, so there is no separate index to keep in sync
        break if limit && pages.size >= limit

        pages << read_page( code )
      end
      pages
    end

    private

    ##
    ## Turn a region name into its directory name:
    ##   change and         => n   (whole word only, NOT inside of other words)
    ##   change &           => n
    ##   change all spaces  => -
    ## e.g. East & Southeast Asia         => east-n-southeast-asia
    ##      Central America and Caribbean => central-america-n-caribbean
    def region_to_slug( text )
      text.downcase.gsub( /\band\b/, 'n' ).gsub( '&', 'n' ).gsub( ' ', '-' )
    end
  end ## JsonPageReader

end # module Factbook
@@ -0,0 +1,307 @@

module Factbook

  ##
  ## Cuts a downloaded country page down to the country profile
  ##   (sections, fields and data only) and collects the page (meta) info.
  class Sanitizer
    include LogUtils::Logging
    include Utils    ## e.g. find_page_info etc.

    ##
    ## returns an array with three entries:
    ##   1) html profile without headers, footers, scripts, etc.
    ##   2) page (meta) info e.g. country_name, country_code, last_updated, etc.
    ##   3) errors - for now always an empty array
    def sanitize( html )
      ## todo: add option for (html source) encoding - why?? why not??

      page_info = PageInfo.new

      ## todo:
      ##   make page info optional? why? why not?
      ##   not always available (if page structure changes) - check
      ##   what page info is required??
      h = find_page_info( html )
      if h
        page_info.country_code        = h[:country_code]
        page_info.country_name        = h[:country_name]
        page_info.country_affiliation = h[:country_affiliation]
        page_info.region_code         = h[:region_code]
        page_info.region_name         = h[:region_name]
      else
        ## print/warn: no page info found
        page_info.country_code = find_country_code( html )
      end

      page_info.last_updated = find_page_last_updated( html )

      html_profile = find_country_profile( html )    ## cut-off headers, footers, scripts, etc.

      ## todo/check: remove 3rd args old errors array - why? why not?
      [html_profile, page_info, []]
    end


    ##
    ## Extract the country profile from the page, that is,
    ##   everything inside of <ul class="expandcollapse">,
    ##   and rebuild it as a flat html string:
    ##
    ##     <h2>section title</h2>
    ##     <h3>field title:</h3>
    ##     ...sanitized category_data div(s)...
    ##
    ## returns an empty string if the list is missing
    ##   or if the page uses the old (h2-based) format.
    ##
    ## note: exits the process (exit 1) on unexpected page structure
    ##   e.g. missing section title or unknown div.
    def find_country_profile( html )
      ##
      ## fix known broken html bugs
      ##   in co (Colombia) page (Nov/11 2020):
      ##    <div class="photogallery_captiontext">
      ##      <p>slightly less than twice the size of Texas</p
      ##    </div>
      ##   note: </p => unclosed p!! change to </p>

      ## note: in regex use negative lookahead e.g. (?!pattern)
      html = html.gsub( %r{</p(?![>])} ) do |m|
        puts "!! WARN: fixing unclosed </p => </p>"
        puts "#{m}"
        '</p>'
      end

      ## note: replace all non-breaking spaces with spaces for now
      ##   see fr (france) in political parties section for example
      html = html.gsub( "&nbsp;", ' ' )

      doc = Nokogiri::HTML( html )

      ## remove header (everything before) <ul class="expandcollapse">
      ul = doc.css( 'ul.expandcollapse' )[0]

      ## note: guard against pages without the list;
      ##   otherwise the ul.to_html call below fails on nil
      if ul.nil?
        puts "!! WARN: no <ul class=\"expandcollapse\"> found - returning empty profile"
        return ''
      end

      puts ul.to_html[0..100]

      ## note: special case cc uses h2 instead of div block
      ##   <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
      ##       style="border-bottom: 2px solid white; cursor: pointer;">
      ##     Introduction :: <span class="region">CURACAO </span>
      ##   </h2>
      ##  is old format !!!!
      ##    cc - CURACAO
      ##    http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
      ##    page says         - PAGE LAST UPDATED ON MARCH 14, 2018
      ##  wait for new version to be generated / pushed!!!

      ## check for old format if h2 are present
      h2s = ul.css( 'h2' )
      if h2s.size > 0
        puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
        ## return empty html string - why? why not?
        return ''
      end

      ###
      ## sanitize
      ##   remove link items
      ##   assume two <li>s are a section

      buf = String.new('')

      ## filter all li's (direct children only)
      ul_children = ul.children.select { |el| el.name == 'li' }
      ## ul_children = ul.css( 'li' )

      puts " #{ul_children.size} li(s):"
      ul_children.each_slice(2) do |lis|
        ## first li of pair - holds the section title
        li  = lis[0]
        div = li.at( 'div[sectiontitle]' )
        if div.nil?
          puts "!! ERROR: no section title found in div:"
          puts li.to_html
          exit 1
        end

        section_title = div['sectiontitle'].to_s

        buf << "<h2>#{section_title}</h2>\n"

        ## second li of pair - holds the fields of the section
        li = lis[1]
        ## filter all div's (direct children only)
        li_children = li.children.select { |el| el.name == 'div' }
        puts "  #{li_children.size} div(s) in >#{section_title}<:"

        li_children.each_slice(2) do |divs|
          ## first div of pair - holds the field title (in an anchor)
          div = divs[0]
          a   = div.css('a')[0]

          if a
            subsection_title = a.text    ## todo/check/rename: use field_name or such - why? why not?
            buf << "\n<h3>#{subsection_title}:</h3>\n"
          else
            subsection_title = '???'
            puts "!! WARN: no anchor found:"
            puts div.to_html
          end

          ## second div of pair - holds the field data
          div = divs[1]
          div_children = div.children.select { |el| el.name == 'div' }
          puts "    #{div_children.size} div(s) in field >#{subsection_title}<:"

          ## use more robust version - only get divs with category_data
          ##  div_children = div.css( 'div.category_data' )
          ##  puts "    #{div_children.size} div(s) in field >#{subsection_title}< v2:"

          # if div_children.size > 14
          #   ## us labor force has 11 divs
          #   ##  possibly an error
          #   puts "!! ERROR - too many category_data divs found:"
          #   puts div.to_html[0..200]
          #   puts "\n...\n"
          #   puts div.to_html[-400..-1]
          #   exit 1
          # end

          div_children.each do |catdiv|
            css_class = catdiv['class']

            if css_class && css_class.index( 'category_data' )
              ## skip attachments e.g. maps, pop pyramids, etc.
              next if css_class.index( 'attachment' )

              buf << sanitize_data( catdiv, title: subsection_title )
              buf << "\n"
            elsif catdiv.to_html.index( 'country comparison to the world' )
              ## silently skip for now country comparison
            else
              puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
              puts catdiv.to_html
              exit 1
            end
          end
        end
      end

      buf
    end


    #
    #  <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
    #
    #  remove aria labels
    ARIA_ATTR_REGEX = /\s*
                       aria-label=('|").+?\1    ## note: use non-greedy match e.g. .+?
                      /xim    ## do NOT allow multi-line - why? why not?

    ## find double breaks e.g. <br><br>
    BR_BR_REGEX = /(<br> \s* <br>)
                  /xim    ## do NOT allow multi-line - why? why not?


    ##
    ## Sanitize a single (category_data) element and return it as html string.
    ##
    ##   el    - nokogiri element; note: gets changed in-place
    ##             (anchors replaced, inner html rewritten)
    ##   title - field title; used for log messages only
    ##
    ## steps:
    ##   0) replace anchors with their (inner) text
    ##   1) unwrap paragraphs; join two or more with the ++ marker
    ##   2) remove aria-label attributes
    ##   3) squish double breaks and replace breaks with the ++ marker
    ##   4) remove ++ marker right after a field marker e.g. of: ++ => of:
    ##   5) replace smart (right) single quote with ascii apostrophe
    def sanitize_data( el, title: )
      ## todo/fix/check:
      ##    check if more than one p(aragraph)
      ##    get squeezed together without space in between?

      ## step 0: replace all possible a(nchor) links with just inner text
      el.css( 'a' ).each do |a|
        a.replace( " #{a.text.strip} " )
      end

      inner_html = String.new('')

      ## step 1 - unwrap paragraphs if present
      ##          and convert dom/nokogiri doc/tree to html string
      p_count = 0
      el.children.each do |child|
        if child.name == 'p'
          ## puts "  [debug ] unwrap <p> no.#{p_count+1}"

          p_inner_html = child.inner_html.strip    ## note: unwrap! use inner_html NOT to_html/html

          ## note: skip empty paragraphs for now
          next if p_inner_html.empty?

          inner_html << ' ++ '   if p_count > 0
          inner_html << p_inner_html
          inner_html << " \n\n "

          p_count += 1
        else
          inner_html << child.to_html
        end
      end

      ## note: keep container div!! just replace inner html!!!
      ## note: right strip all trailing spaces/newlines for now
      ##         plus add back a single one for pretty printing
      el.inner_html = inner_html.rstrip + "\n"

      # finally - convert back to html (string)
      html = el.to_html

      ## do not report / keep silent for now
      html = html.gsub( ARIA_ATTR_REGEX, '' )

      html = html.gsub( BR_BR_REGEX ) do |m|
        puts "in >#{title}< squish two <br>s into one:"
        puts "#{m}"
        '<br>'
      end

      html = html.gsub( /<br>/i ) do |m|
        puts "in >#{title}< replace <br> with inline (plain) text ++:"
        puts "#{m}"
        ' ++ '
      end

      ## cleanup/remove ++ before subfield e.g.
      ##   of: ++  => of:    or such
      html = html.gsub( %r{
                (?<=([a-z]:)|(:</span>))  # note: use zero-length positive lookbehind
                \s+
                \+{2}
                \s+}xim ) do |m|
        puts "in >#{title}< remove ++ before <field>: marker:"
        puts "#{m}"
        ' '
      end

      #####
      # "unfancy" smart quotes to ascii - why? why not?
      # e.g.
      #   Following Britain’s victory  => Following Britain's victory
      html = html.tr( "’", "'" )

      html
    end

  end # class Sanitizer

end # module Factbook
@@ -0,0 +1,29 @@
# encoding: utf-8

module Factbook

  ##
  ## A section of a page: a title plus an (ordered) list of subsections.
  class Sect
    include LogUtils::Logging

    attr_accessor :title       ## use name instead of title - why? why not?
    attr_accessor :subsects    ## array of subsections; starts out empty

    def initialize
      @subsects = []
    end

    ##
    ## Convert the subsections to a hash keyed by subsection title.
    ##
    ## note: the hash gets rebuilt on every call (and kept in @data);
    ##   if two subsections share a title the last one wins.
    def data
      @data = subsects.each_with_object( {} ) do |subsect, hash|
        hash[ subsect.title ] = subsect.data
      end
    end

  end # class Sect

end # module Factbook
@@ -0,0 +1,18 @@
# encoding: utf-8

module Factbook

  ##
  ## A subsection of a section: a title plus a data hash.
  class Subsect
    include LogUtils::Logging

    attr_accessor :title    ## use name instead of title - why? why not?
    attr_accessor :data     ## hash holding data e.g. { 'text' => '...' etc. }

    ## note: data starts out as an empty hash; title stays nil until set
    def initialize
      @data = {}
    end

  end # class Subsect

end # module Factbook
@@ -0,0 +1,52 @@
# encoding: utf-8

module Factbook

  ##
  ##  make more "generic" - why? why not?
  ##   (re)use for other files ?? move to textutils ??

  ##
  ## for now reads in rows with values separated by at least 3+ spaces e.g.:
  ##   see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
  ##  1      China             1,367,485,388
  ##  2      India             1,251,695,584
  ##  3      European Union    513,949,445
  ##  4      United States     321,368,864
  ##  5      Indonesia         255,993,674
  ##  6      Brazil            204,259,812

  class TableReader
    include LogUtils::Logging

    ## text - the complete table as a single string (one row per line)
    def initialize( text )
      @text = text
    end

    ##
    ## Split the text into records.
    ##
    ## returns an array of rows; every row is an array of (string) values.
    ##   empty (or whitespace-only) lines get skipped (and reported).
    def read
      recs = []

      @text.each_line.with_index( 1 ) do |line, line_no|
        line = line.strip   ## remove leading and trailing whitespace
        if line.empty?
          puts "** skipping empty line #{line_no}"
          next
        end

        ## split three or more spaces - use just two ?? why? why not??
        ##   note: single (and double) spaces stay inside a value e.g. European Union
        recs << line.split( /[ ]{3,}/ )
      end

      recs
    end

  end # class TableReader

end # module Factbook