factbook-readers 0.0.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +0 -16
- data/README.md +13 -14
- data/data/codes.csv +1 -1
- data/lib/factbook-readers.rb +11 -12
- data/lib/factbook-readers/builder.rb +28 -53
- data/lib/factbook-readers/builder_json.rb +9 -20
- data/lib/factbook-readers/codes.rb +3 -2
- data/lib/factbook-readers/comparisons.rb +2 -2
- data/lib/factbook-readers/page.rb +59 -85
- data/lib/factbook-readers/sanitizer.rb +13 -34
- data/lib/factbook-readers/version.rb +2 -2
- data/test/helper.rb +1 -5
- data/test/test_builder.rb +1 -6
- data/test/test_codes.rb +5 -9
- data/test/test_comparisons.rb +2 -5
- data/test/test_counter.rb +4 -6
- data/test/test_fields.rb +0 -2
- data/test/test_item_builder.rb +7 -9
- data/test/test_json.rb +1 -3
- data/test/test_json_builder.rb +1 -3
- data/test/test_normalize.rb +0 -2
- data/test/test_page.rb +2 -4
- data/test/test_sanitizer.rb +2 -5
- data/test/test_sanitizer_regex.rb +0 -2
- metadata +2 -18
- data/data/attributes.yml +0 -337
- data/lib/factbook-readers/attributes.rb +0 -74
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_convert.rb +0 -30
- data/test/test_importer.rb +0 -56
@@ -64,12 +64,6 @@ def find_country_profile( html )
|
|
64
64
|
end
|
65
65
|
|
66
66
|
|
67
|
-
## note: replace all non-breaking spaces with spaces for now
|
68
|
-
## see fr (france) in political parties section for example
|
69
|
-
html = html.gsub( "&nbsp;", ' ' )
|
70
|
-
|
71
|
-
|
72
|
-
|
73
67
|
doc = Nokogiri::HTML( html )
|
74
68
|
|
75
69
|
ul = doc.css( 'ul.expandcollapse' )[0]
|
@@ -77,30 +71,8 @@ def find_country_profile( html )
|
|
77
71
|
puts ul.to_html[0..100]
|
78
72
|
|
79
73
|
|
80
|
-
|
81
|
-
## note: special case cc uses h2 instead of div block
|
82
|
-
## <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
|
83
|
-
## style="border-bottom: 2px solid white; cursor: pointer;">
|
84
|
-
## Introduction :: <span class="region">CURACAO </span>
|
85
|
-
## </h2>
|
86
|
-
## is old format !!!!
|
87
|
-
## cc - CURACAO
|
88
|
-
## http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
|
89
|
-
## page says - PAGE LAST UPDATED ON MARCH 14, 2018
|
90
|
-
## wait for new version to be generated / pushed!!!
|
91
|
-
|
92
|
-
## check for old format if h2 are present
|
93
|
-
h2s = ul.css( 'h2' )
|
94
|
-
if h2s.size > 0
|
95
|
-
puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
|
96
|
-
## return empty html string - why? why not?
|
97
|
-
return ''
|
98
|
-
end
|
99
|
-
|
100
|
-
|
101
74
|
###
|
102
75
|
## sanitize
|
103
|
-
|
104
76
|
## remove link items
|
105
77
|
## assume two <li>s are a section
|
106
78
|
|
@@ -204,12 +176,12 @@ end
|
|
204
176
|
# <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
|
205
177
|
#
|
206
178
|
# remove aria labels
|
207
|
-
|
179
|
+
ARIA_ATTR_RE = /\s*
|
208
180
|
aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
|
209
181
|
/xim ## do NOT allow multi-line - why? why not?
|
210
182
|
|
211
183
|
## find double breaks e.g. <br><br>
|
212
|
-
|
184
|
+
BR_BR_RE = /(<br> \s* <br>)
|
213
185
|
/xim ## do NOT allow multi-line - why? why not?
|
214
186
|
|
215
187
|
|
@@ -252,6 +224,12 @@ def sanitize_data( el, title: )
|
|
252
224
|
## note: keep container div!! just replace inner html!!!
|
253
225
|
## note: right strip all trailing spaces/newlines for now
|
254
226
|
## plus add back a single one for pretty printing
|
227
|
+
|
228
|
+
## note: replace all non-breaking spaces with spaces for now
|
229
|
+
## see fr (france) in political parties section for example
|
230
|
+
## todo/check/fix: check if we need to use unicode char!! and NOT html entity
|
231
|
+
inner_html = inner_html.gsub( "&nbsp;", ' ' )
|
232
|
+
|
255
233
|
el.inner_html = inner_html.rstrip + "\n"
|
256
234
|
|
257
235
|
# finally - convert back to html (string)
|
@@ -259,14 +237,14 @@ def sanitize_data( el, title: )
|
|
259
237
|
|
260
238
|
|
261
239
|
|
262
|
-
html = html.gsub(
|
240
|
+
html = html.gsub( ARIA_ATTR_RE ) do |m|
|
263
241
|
## do not report / keep silent for now
|
264
242
|
## puts "in >#{title}< remove aria-label attr:"
|
265
243
|
## puts "#{m}"
|
266
244
|
''
|
267
245
|
end
|
268
246
|
|
269
|
-
html = html.gsub(
|
247
|
+
html = html.gsub( BR_BR_RE ) do |m|
|
270
248
|
puts "in >#{title}< squish two <br>s into one:"
|
271
249
|
puts "#{m}"
|
272
250
|
'<br>'
|
@@ -280,11 +258,12 @@ def sanitize_data( el, title: )
|
|
280
258
|
|
281
259
|
## cleanup/remove ++ before subfield e.g.
|
282
260
|
## of: ++ => of: or such
|
261
|
+
##
|
262
|
+
## todo/fix: add negative lookahead e.g. not another + to be more specific!!
|
283
263
|
html = html.gsub( %r{
|
284
264
|
(?<=([a-z]:)|(:</span>)) # note: use zero-length positive lookbehind
|
285
265
|
\s+
|
286
|
-
\+{2}
|
287
|
-
\s+}xim ) do |m|
|
266
|
+
\+{2}}xim ) do |m|
|
288
267
|
puts "in >#{title} remove ++ before <field>: marker:"
|
289
268
|
puts "#{m}"
|
290
269
|
' '
|
data/test/helper.rb
CHANGED
data/test/test_builder.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_builder.rb
|
@@ -8,14 +6,11 @@
|
|
8
6
|
require 'helper'
|
9
7
|
|
10
8
|
|
11
|
-
##
|
12
|
-
## use/fix: ASCII-8BIT (e.g.keep as is)
|
13
|
-
|
14
9
|
|
15
10
|
class TestBuilder < MiniTest::Test
|
16
11
|
|
17
12
|
def test_build
|
18
|
-
|
13
|
+
|
19
14
|
['au','be'].each do |code|
|
20
15
|
## use/fix: ASCII-8BIT (e.g.keep as is) -???
|
21
16
|
## fix/todo: use ASCII8BIT/binary reader ??
|
data/test/test_codes.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_codes.rb
|
@@ -12,9 +10,7 @@ class TestCodes < MiniTest::Test
|
|
12
10
|
|
13
11
|
|
14
12
|
def test_codes
|
15
|
-
|
16
|
-
assert_equal 261, Factbook::CODES.size
|
17
|
-
assert_equal 261, Factbook.codes.size
|
13
|
+
assert_equal 261, Factbook.codes.size
|
18
14
|
assert_equal 261, Factbook.codes.to_a.size
|
19
15
|
|
20
16
|
|
@@ -27,7 +23,7 @@ class TestCodes < MiniTest::Test
|
|
27
23
|
|
28
24
|
assert_equal 8, Factbook.codes.dependencies_us.size
|
29
25
|
|
30
|
-
|
26
|
+
|
31
27
|
assert_equal 55, Factbook.codes.europe.size
|
32
28
|
assert_equal 9, Factbook.codes.south_asia.size
|
33
29
|
assert_equal 6, Factbook.codes.central_asia.size
|
@@ -43,11 +39,11 @@ class TestCodes < MiniTest::Test
|
|
43
39
|
assert_equal 1, Factbook.codes.region('World').size
|
44
40
|
|
45
41
|
assert_equal 45, Factbook.codes.countries.europe.size
|
46
|
-
|
42
|
+
|
47
43
|
assert_equal Factbook.codes.category('Oceans').size, Factbook.codes.region('Oceans').size
|
48
44
|
assert_equal Factbook.codes.category('World').size, Factbook.codes.region('World').size
|
49
45
|
|
50
|
-
|
46
|
+
|
51
47
|
assert_equal 261, Factbook.codes.countries.size +
|
52
48
|
Factbook.codes.others.size +
|
53
49
|
Factbook.codes.dependencies.size +
|
@@ -68,7 +64,7 @@ class TestCodes < MiniTest::Test
|
|
68
64
|
Factbook.codes.antartica.size +
|
69
65
|
Factbook.codes.region('Oceans').size +
|
70
66
|
Factbook.codes.region('World').size
|
71
|
-
|
67
|
+
|
72
68
|
end
|
73
69
|
|
74
70
|
end # class TestCodes
|
data/test/test_comparisons.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_comparisons.rb
|
@@ -11,9 +9,8 @@ require 'helper'
|
|
11
9
|
class TestComparisons < MiniTest::Test
|
12
10
|
|
13
11
|
def test_comparisons
|
14
|
-
assert_equal 74, Factbook
|
15
|
-
assert_equal 74, Factbook.comparisons.size
|
16
|
-
assert_equal 74, Factbook.comparisons.to_a.size
|
12
|
+
assert_equal 74, Factbook.comparisons.size
|
13
|
+
assert_equal 74, Factbook.comparisons.to_a.size
|
17
14
|
end
|
18
15
|
|
19
16
|
end # class TestComparisons
|
data/test/test_counter.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_counter.rb
|
@@ -16,16 +14,16 @@ class TestCounter < MiniTest::Test
|
|
16
14
|
|
17
15
|
def test_counter
|
18
16
|
c = Factbook::Counter.new
|
19
|
-
|
17
|
+
|
20
18
|
codes = %w(au be)
|
21
19
|
codes.each do |code|
|
22
20
|
c.count( read_test_page( code )) # use builtin test page (do NOT fetch via internet)
|
23
21
|
end
|
24
|
-
|
22
|
+
|
25
23
|
h = c.data
|
26
24
|
pp h
|
27
|
-
|
28
|
-
assert true ## assume everything ok if we get here
|
25
|
+
|
26
|
+
assert true ## assume everything ok if we get here
|
29
27
|
end
|
30
28
|
|
31
29
|
end # class TestCounter
|
data/test/test_fields.rb
CHANGED
data/test/test_item_builder.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_item_builder.rb
|
@@ -15,10 +13,10 @@ class TestItemBuilder < MiniTest::Test
|
|
15
13
|
html =<<EOS
|
16
14
|
<div class=category_data>Central Europe, north of Italy and Slovenia</div>
|
17
15
|
EOS
|
18
|
-
|
16
|
+
|
19
17
|
b = Factbook::ItemBuilder.new( html, 'Location' )
|
20
18
|
b.read
|
21
|
-
|
19
|
+
|
22
20
|
assert true ## assume everthing ok
|
23
21
|
end
|
24
22
|
|
@@ -31,7 +29,7 @@ EOS
|
|
31
29
|
|
32
30
|
b = Factbook::ItemBuilder.new( html, 'Area' )
|
33
31
|
b.read
|
34
|
-
|
32
|
+
|
35
33
|
assert true ## assume everthing ok
|
36
34
|
end
|
37
35
|
|
@@ -45,7 +43,7 @@ EOS
|
|
45
43
|
|
46
44
|
b = Factbook::ItemBuilder.new( html, 'Land use' )
|
47
45
|
b.read
|
48
|
-
|
46
|
+
|
49
47
|
assert true ## assume everthing ok
|
50
48
|
end
|
51
49
|
|
@@ -57,7 +55,7 @@ EOS
|
|
57
55
|
|
58
56
|
b = Factbook::ItemBuilder.new( html, 'Contraceptive Prevalence Rate' )
|
59
57
|
b.read
|
60
|
-
|
58
|
+
|
61
59
|
assert true ## assume everthing ok
|
62
60
|
end
|
63
61
|
|
@@ -75,7 +73,7 @@ EOS
|
|
75
73
|
|
76
74
|
b = Factbook::ItemBuilder.new( html, 'Drinking Water Source' )
|
77
75
|
b.read
|
78
|
-
|
76
|
+
|
79
77
|
assert true ## assume everthing ok
|
80
78
|
end
|
81
79
|
|
@@ -91,7 +89,7 @@ EOS
|
|
91
89
|
|
92
90
|
b = Factbook::ItemBuilder.new( html, 'Political pressure groups and leaders' )
|
93
91
|
b.read
|
94
|
-
|
92
|
+
|
95
93
|
assert true ## assume everthing ok
|
96
94
|
end
|
97
95
|
|
data/test/test_json.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_json.rb
|
@@ -35,7 +33,7 @@ class TestJson < MiniTest::Test
|
|
35
33
|
|
36
34
|
### save to json
|
37
35
|
puts "saving a copy to #{code}.json for debugging"
|
38
|
-
File.open( "tmp/#{code}.json", 'w' ) do |f|
|
36
|
+
File.open( "tmp/#{code}.json", 'w:utf-8' ) do |f|
|
39
37
|
f.write JSON.pretty_generate( h )
|
40
38
|
## f.write page.to_json
|
41
39
|
end
|
data/test/test_json_builder.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_json_builder.rb
|
@@ -13,7 +11,7 @@ class TestJsonBuilder < MiniTest::Test
|
|
13
11
|
def test_read
|
14
12
|
code = 'au'
|
15
13
|
b = Factbook::JsonBuilder.from_file( "#{Factbook.root}/test/data/json/#{code}.json" )
|
16
|
-
|
14
|
+
|
17
15
|
assert_equal 10, b.sects.size
|
18
16
|
assert_equal 1, b.sects[0].subsects.size ## e.g. Introduction/Background
|
19
17
|
assert_equal 'Central Europe, north of Italy and Slovenia', b.json['Geography']['Location']['text']
|
data/test/test_normalize.rb
CHANGED
data/test/test_page.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_page.rb
|
@@ -22,14 +20,14 @@ class TestPage < MiniTest::Test
|
|
22
20
|
# [ 'ls', 9 ],
|
23
21
|
# [ 'vt', 8 ],
|
24
22
|
]
|
25
|
-
|
23
|
+
|
26
24
|
pages.each do |rec|
|
27
25
|
code = rec[0]
|
28
26
|
sects_size = rec[1]
|
29
27
|
|
30
28
|
html = File.read( "#{Factbook.root}/test/data/src/#{code}.html" )
|
31
29
|
page = Factbook::Page.new( code, html: html )
|
32
|
-
|
30
|
+
|
33
31
|
assert_equal sects_size, page.sects.size
|
34
32
|
end
|
35
33
|
end
|
data/test/test_sanitizer.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_sanitizer.rb
|
@@ -18,10 +16,9 @@ class TestSanitizer < MiniTest::Test
|
|
18
16
|
## ['au'].each do |cnty|
|
19
17
|
['au','ag','be'].each do |cnty|
|
20
18
|
|
21
|
-
|
22
|
-
html_ascii = File.read( "#{Factbook.root}/test/data/src/#{cnty}.html" ) ## fix/todo: use ASCII8BIT/binary reader ??
|
19
|
+
html_original = File.read( "#{Factbook.root}/test/data/src/#{cnty}.html", 'r:utf-8' ) { |f| r.read }
|
23
20
|
|
24
|
-
html, info, errors = Factbook::Sanitizer.new.sanitize( html_ascii )
|
21
|
+
html, info, errors = Factbook::Sanitizer.new.sanitize( html_original )
|
25
22
|
|
26
23
|
File.open( "./tmp/#{cnty}.profile.html", 'w' ) do |f|
|
27
24
|
f.write "** info:\n"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: factbook-readers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-11-
|
11
|
+
date: 2020-11-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: logutils
|
@@ -114,13 +114,11 @@ files:
|
|
114
114
|
- Manifest.txt
|
115
115
|
- README.md
|
116
116
|
- Rakefile
|
117
|
-
- data/attributes.yml
|
118
117
|
- data/categories.csv
|
119
118
|
- data/codes.csv
|
120
119
|
- data/codesxref.csv
|
121
120
|
- data/comparisons.csv
|
122
121
|
- lib/factbook-readers.rb
|
123
|
-
- lib/factbook-readers/attributes.rb
|
124
122
|
- lib/factbook-readers/builder.rb
|
125
123
|
- lib/factbook-readers/builder_item.rb
|
126
124
|
- lib/factbook-readers/builder_json.rb
|
@@ -139,26 +137,12 @@ files:
|
|
139
137
|
- lib/factbook-readers/utils_info.rb
|
140
138
|
- lib/factbook-readers/version.rb
|
141
139
|
- lib/factbook/readers.rb
|
142
|
-
- test/data/au.html
|
143
|
-
- test/data/au.yml
|
144
|
-
- test/data/be.html
|
145
|
-
- test/data/be.yml
|
146
|
-
- test/data/json/au.json
|
147
|
-
- test/data/src/ag.html
|
148
|
-
- test/data/src/au-2015-09-24.html
|
149
|
-
- test/data/src/au.html
|
150
|
-
- test/data/src/be-2015-09-24.html
|
151
|
-
- test/data/src/be.html
|
152
140
|
- test/helper.rb
|
153
|
-
- test/test_attribs.rb
|
154
|
-
- test/test_attribs_def.rb
|
155
141
|
- test/test_builder.rb
|
156
142
|
- test/test_codes.rb
|
157
143
|
- test/test_comparisons.rb
|
158
|
-
- test/test_convert.rb
|
159
144
|
- test/test_counter.rb
|
160
145
|
- test/test_fields.rb
|
161
|
-
- test/test_importer.rb
|
162
146
|
- test/test_item_builder.rb
|
163
147
|
- test/test_json.rb
|
164
148
|
- test/test_json_builder.rb
|