factbook-readers 0.0.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +0 -16
- data/README.md +13 -14
- data/data/codes.csv +1 -1
- data/lib/factbook-readers.rb +11 -12
- data/lib/factbook-readers/builder.rb +28 -53
- data/lib/factbook-readers/builder_json.rb +9 -20
- data/lib/factbook-readers/codes.rb +3 -2
- data/lib/factbook-readers/comparisons.rb +2 -2
- data/lib/factbook-readers/page.rb +59 -85
- data/lib/factbook-readers/sanitizer.rb +13 -34
- data/lib/factbook-readers/version.rb +2 -2
- data/test/helper.rb +1 -5
- data/test/test_builder.rb +1 -6
- data/test/test_codes.rb +5 -9
- data/test/test_comparisons.rb +2 -5
- data/test/test_counter.rb +4 -6
- data/test/test_fields.rb +0 -2
- data/test/test_item_builder.rb +7 -9
- data/test/test_json.rb +1 -3
- data/test/test_json_builder.rb +1 -3
- data/test/test_normalize.rb +0 -2
- data/test/test_page.rb +2 -4
- data/test/test_sanitizer.rb +2 -5
- data/test/test_sanitizer_regex.rb +0 -2
- metadata +2 -18
- data/data/attributes.yml +0 -337
- data/lib/factbook-readers/attributes.rb +0 -74
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_convert.rb +0 -30
- data/test/test_importer.rb +0 -56
@@ -64,12 +64,6 @@ def find_country_profile( html )
|
|
64
64
|
end
|
65
65
|
|
66
66
|
|
67
|
-
## note: replace all non-breaking spaces with spaces for now
|
68
|
-
## see fr (france) in political parties section for example
|
69
|
-
html = html.gsub( "&nbsp;", ' ' )
|
70
|
-
|
71
|
-
|
72
|
-
|
73
67
|
doc = Nokogiri::HTML( html )
|
74
68
|
|
75
69
|
ul = doc.css( 'ul.expandcollapse' )[0]
|
@@ -77,30 +71,8 @@ def find_country_profile( html )
|
|
77
71
|
puts ul.to_html[0..100]
|
78
72
|
|
79
73
|
|
80
|
-
|
81
|
-
## note: special case cc uses h2 instead of div block
|
82
|
-
## <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
|
83
|
-
## style="border-bottom: 2px solid white; cursor: pointer;">
|
84
|
-
## Introduction :: <span class="region">CURACAO </span>
|
85
|
-
## </h2>
|
86
|
-
## is old format !!!!
|
87
|
-
## cc - CURACAO
|
88
|
-
## http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
|
89
|
-
## page says - PAGE LAST UPDATED ON MARCH 14, 2018
|
90
|
-
## wait for new version to be generated / pushed!!!
|
91
|
-
|
92
|
-
## check for old format if h2 are present
|
93
|
-
h2s = ul.css( 'h2' )
|
94
|
-
if h2s.size > 0
|
95
|
-
puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
|
96
|
-
## return empty html string - why? why not?
|
97
|
-
return ''
|
98
|
-
end
|
99
|
-
|
100
|
-
|
101
74
|
###
|
102
75
|
## sanitize
|
103
|
-
|
104
76
|
## remove link items
|
105
77
|
## assume two <li>s are a section
|
106
78
|
|
@@ -204,12 +176,12 @@ end
|
|
204
176
|
# <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
|
205
177
|
#
|
206
178
|
# remove aria labels
|
207
|
-
|
179
|
+
ARIA_ATTR_RE = /\s*
|
208
180
|
aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
|
209
181
|
/xim ## do NOT allow multi-line - why? why not?
|
210
182
|
|
211
183
|
## find double breaks e.g. <br><br>
|
212
|
-
|
184
|
+
BR_BR_RE = /(<br> \s* <br>)
|
213
185
|
/xim ## do NOT allow multi-line - why? why not?
|
214
186
|
|
215
187
|
|
@@ -252,6 +224,12 @@ def sanitize_data( el, title: )
|
|
252
224
|
## note: keep container div!! just replace inner html!!!
|
253
225
|
## note: right strip all trailing spaces/newlines for now
|
254
226
|
## plus add back a single one for pretty printing
|
227
|
+
|
228
|
+
## note: replace all non-breaking spaces with spaces for now
|
229
|
+
## see fr (france) in political parties section for example
|
230
|
+
## todo/check/fix: check if we need to use unicode char!! and NOT html entity
|
231
|
+
inner_html = inner_html.gsub( "&nbsp;", ' ' )
|
232
|
+
|
255
233
|
el.inner_html = inner_html.rstrip + "\n"
|
256
234
|
|
257
235
|
# finally - convert back to html (string)
|
@@ -259,14 +237,14 @@ def sanitize_data( el, title: )
|
|
259
237
|
|
260
238
|
|
261
239
|
|
262
|
-
html = html.gsub(
|
240
|
+
html = html.gsub( ARIA_ATTR_RE ) do |m|
|
263
241
|
## do not report / keep silent for now
|
264
242
|
## puts "in >#{title}< remove aria-label attr:"
|
265
243
|
## puts "#{m}"
|
266
244
|
''
|
267
245
|
end
|
268
246
|
|
269
|
-
html = html.gsub(
|
247
|
+
html = html.gsub( BR_BR_RE ) do |m|
|
270
248
|
puts "in >#{title}< squish two <br>s into one:"
|
271
249
|
puts "#{m}"
|
272
250
|
'<br>'
|
@@ -280,11 +258,12 @@ def sanitize_data( el, title: )
|
|
280
258
|
|
281
259
|
## cleanup/remove ++ before subfield e.g.
|
282
260
|
## of: ++ => of: or such
|
261
|
+
##
|
262
|
+
## todo/fix: add negative lookahead e.g. not another + to be more specific!!
|
283
263
|
html = html.gsub( %r{
|
284
264
|
(?<=([a-z]:)|(:</span>)) # note: use zero-length positive lookbehind
|
285
265
|
\s+
|
286
|
-
\+{2}
|
287
|
-
\s+}xim ) do |m|
|
266
|
+
\+{2}}xim ) do |m|
|
288
267
|
puts "in >#{title} remove ++ before <field>: marker:"
|
289
268
|
puts "#{m}"
|
290
269
|
' '
|
data/test/helper.rb
CHANGED
data/test/test_builder.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_builder.rb
|
@@ -8,14 +6,11 @@
|
|
8
6
|
require 'helper'
|
9
7
|
|
10
8
|
|
11
|
-
##
|
12
|
-
## use/fix: ASCII-8BIT (e.g.keep as is)
|
13
|
-
|
14
9
|
|
15
10
|
class TestBuilder < MiniTest::Test
|
16
11
|
|
17
12
|
def test_build
|
18
|
-
|
13
|
+
|
19
14
|
['au','be'].each do |code|
|
20
15
|
## use/fix: ASCII-8BIT (e.g.keep as is) -???
|
21
16
|
## fix/todo: use ASCII8BIT/binary reader ??
|
data/test/test_codes.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_codes.rb
|
@@ -12,9 +10,7 @@ class TestCodes < MiniTest::Test
|
|
12
10
|
|
13
11
|
|
14
12
|
def test_codes
|
15
|
-
|
16
|
-
assert_equal 261, Factbook::CODES.size
|
17
|
-
assert_equal 261, Factbook.codes.size
|
13
|
+
assert_equal 261, Factbook.codes.size
|
18
14
|
assert_equal 261, Factbook.codes.to_a.size
|
19
15
|
|
20
16
|
|
@@ -27,7 +23,7 @@ class TestCodes < MiniTest::Test
|
|
27
23
|
|
28
24
|
assert_equal 8, Factbook.codes.dependencies_us.size
|
29
25
|
|
30
|
-
|
26
|
+
|
31
27
|
assert_equal 55, Factbook.codes.europe.size
|
32
28
|
assert_equal 9, Factbook.codes.south_asia.size
|
33
29
|
assert_equal 6, Factbook.codes.central_asia.size
|
@@ -43,11 +39,11 @@ class TestCodes < MiniTest::Test
|
|
43
39
|
assert_equal 1, Factbook.codes.region('World').size
|
44
40
|
|
45
41
|
assert_equal 45, Factbook.codes.countries.europe.size
|
46
|
-
|
42
|
+
|
47
43
|
assert_equal Factbook.codes.category('Oceans').size, Factbook.codes.region('Oceans').size
|
48
44
|
assert_equal Factbook.codes.category('World').size, Factbook.codes.region('World').size
|
49
45
|
|
50
|
-
|
46
|
+
|
51
47
|
assert_equal 261, Factbook.codes.countries.size +
|
52
48
|
Factbook.codes.others.size +
|
53
49
|
Factbook.codes.dependencies.size +
|
@@ -68,7 +64,7 @@ class TestCodes < MiniTest::Test
|
|
68
64
|
Factbook.codes.antartica.size +
|
69
65
|
Factbook.codes.region('Oceans').size +
|
70
66
|
Factbook.codes.region('World').size
|
71
|
-
|
67
|
+
|
72
68
|
end
|
73
69
|
|
74
70
|
end # class TestCodes
|
data/test/test_comparisons.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_comparisons.rb
|
@@ -11,9 +9,8 @@ require 'helper'
|
|
11
9
|
class TestComparisons < MiniTest::Test
|
12
10
|
|
13
11
|
def test_comparisons
|
14
|
-
assert_equal 74, Factbook
|
15
|
-
assert_equal 74, Factbook.comparisons.size
|
16
|
-
assert_equal 74, Factbook.comparisons.to_a.size
|
12
|
+
assert_equal 74, Factbook.comparisons.size
|
13
|
+
assert_equal 74, Factbook.comparisons.to_a.size
|
17
14
|
end
|
18
15
|
|
19
16
|
end # class TestComparisons
|
data/test/test_counter.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_counter.rb
|
@@ -16,16 +14,16 @@ class TestCounter < MiniTest::Test
|
|
16
14
|
|
17
15
|
def test_counter
|
18
16
|
c = Factbook::Counter.new
|
19
|
-
|
17
|
+
|
20
18
|
codes = %w(au be)
|
21
19
|
codes.each do |code|
|
22
20
|
c.count( read_test_page( code )) # use builtin test page (do NOT fetch via internet)
|
23
21
|
end
|
24
|
-
|
22
|
+
|
25
23
|
h = c.data
|
26
24
|
pp h
|
27
|
-
|
28
|
-
assert true ## assume everything ok if we get here
|
25
|
+
|
26
|
+
assert true ## assume everything ok if we get here
|
29
27
|
end
|
30
28
|
|
31
29
|
end # class TestCounter
|
data/test/test_fields.rb
CHANGED
data/test/test_item_builder.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_item_builder.rb
|
@@ -15,10 +13,10 @@ class TestItemBuilder < MiniTest::Test
|
|
15
13
|
html =<<EOS
|
16
14
|
<div class=category_data>Central Europe, north of Italy and Slovenia</div>
|
17
15
|
EOS
|
18
|
-
|
16
|
+
|
19
17
|
b = Factbook::ItemBuilder.new( html, 'Location' )
|
20
18
|
b.read
|
21
|
-
|
19
|
+
|
22
20
|
assert true ## assume everthing ok
|
23
21
|
end
|
24
22
|
|
@@ -31,7 +29,7 @@ EOS
|
|
31
29
|
|
32
30
|
b = Factbook::ItemBuilder.new( html, 'Area' )
|
33
31
|
b.read
|
34
|
-
|
32
|
+
|
35
33
|
assert true ## assume everthing ok
|
36
34
|
end
|
37
35
|
|
@@ -45,7 +43,7 @@ EOS
|
|
45
43
|
|
46
44
|
b = Factbook::ItemBuilder.new( html, 'Land use' )
|
47
45
|
b.read
|
48
|
-
|
46
|
+
|
49
47
|
assert true ## assume everthing ok
|
50
48
|
end
|
51
49
|
|
@@ -57,7 +55,7 @@ EOS
|
|
57
55
|
|
58
56
|
b = Factbook::ItemBuilder.new( html, 'Contraceptive Prevalence Rate' )
|
59
57
|
b.read
|
60
|
-
|
58
|
+
|
61
59
|
assert true ## assume everthing ok
|
62
60
|
end
|
63
61
|
|
@@ -75,7 +73,7 @@ EOS
|
|
75
73
|
|
76
74
|
b = Factbook::ItemBuilder.new( html, 'Drinking Water Source' )
|
77
75
|
b.read
|
78
|
-
|
76
|
+
|
79
77
|
assert true ## assume everthing ok
|
80
78
|
end
|
81
79
|
|
@@ -91,7 +89,7 @@ EOS
|
|
91
89
|
|
92
90
|
b = Factbook::ItemBuilder.new( html, 'Political pressure groups and leaders' )
|
93
91
|
b.read
|
94
|
-
|
92
|
+
|
95
93
|
assert true ## assume everthing ok
|
96
94
|
end
|
97
95
|
|
data/test/test_json.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_json.rb
|
@@ -35,7 +33,7 @@ class TestJson < MiniTest::Test
|
|
35
33
|
|
36
34
|
### save to json
|
37
35
|
puts "saving a copy to #{code}.json for debugging"
|
38
|
-
File.open( "tmp/#{code}.json", 'w' ) do |f|
|
36
|
+
File.open( "tmp/#{code}.json", 'w:utf-8' ) do |f|
|
39
37
|
f.write JSON.pretty_generate( h )
|
40
38
|
## f.write page.to_json
|
41
39
|
end
|
data/test/test_json_builder.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_json_builder.rb
|
@@ -13,7 +11,7 @@ class TestJsonBuilder < MiniTest::Test
|
|
13
11
|
def test_read
|
14
12
|
code = 'au'
|
15
13
|
b = Factbook::JsonBuilder.from_file( "#{Factbook.root}/test/data/json/#{code}.json" )
|
16
|
-
|
14
|
+
|
17
15
|
assert_equal 10, b.sects.size
|
18
16
|
assert_equal 1, b.sects[0].subsects.size ## e.g. Introduction/Background
|
19
17
|
assert_equal 'Central Europe, north of Italy and Slovenia', b.json['Geography']['Location']['text']
|
data/test/test_normalize.rb
CHANGED
data/test/test_page.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_page.rb
|
@@ -22,14 +20,14 @@ class TestPage < MiniTest::Test
|
|
22
20
|
# [ 'ls', 9 ],
|
23
21
|
# [ 'vt', 8 ],
|
24
22
|
]
|
25
|
-
|
23
|
+
|
26
24
|
pages.each do |rec|
|
27
25
|
code = rec[0]
|
28
26
|
sects_size = rec[1]
|
29
27
|
|
30
28
|
html = File.read( "#{Factbook.root}/test/data/src/#{code}.html" )
|
31
29
|
page = Factbook::Page.new( code, html: html )
|
32
|
-
|
30
|
+
|
33
31
|
assert_equal sects_size, page.sects.size
|
34
32
|
end
|
35
33
|
end
|
data/test/test_sanitizer.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
###
|
4
2
|
# to run use
|
5
3
|
# ruby -I ./lib -I ./test test/test_sanitizer.rb
|
@@ -18,10 +16,9 @@ class TestSanitizer < MiniTest::Test
|
|
18
16
|
## ['au'].each do |cnty|
|
19
17
|
['au','ag','be'].each do |cnty|
|
20
18
|
|
21
|
-
|
22
|
-
html_ascii = File.read( "#{Factbook.root}/test/data/src/#{cnty}.html" ) ## fix/todo: use ASCII8BIT/binary reader ??
|
19
|
+
html_original = File.read( "#{Factbook.root}/test/data/src/#{cnty}.html", 'r:utf-8' ) { |f| r.read }
|
23
20
|
|
24
|
-
html, info, errors = Factbook::Sanitizer.new.sanitize(
|
21
|
+
html, info, errors = Factbook::Sanitizer.new.sanitize( html_original )
|
25
22
|
|
26
23
|
File.open( "./tmp/#{cnty}.profile.html", 'w' ) do |f|
|
27
24
|
f.write "** info:\n"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: factbook-readers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-11-
|
11
|
+
date: 2020-11-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: logutils
|
@@ -114,13 +114,11 @@ files:
|
|
114
114
|
- Manifest.txt
|
115
115
|
- README.md
|
116
116
|
- Rakefile
|
117
|
-
- data/attributes.yml
|
118
117
|
- data/categories.csv
|
119
118
|
- data/codes.csv
|
120
119
|
- data/codesxref.csv
|
121
120
|
- data/comparisons.csv
|
122
121
|
- lib/factbook-readers.rb
|
123
|
-
- lib/factbook-readers/attributes.rb
|
124
122
|
- lib/factbook-readers/builder.rb
|
125
123
|
- lib/factbook-readers/builder_item.rb
|
126
124
|
- lib/factbook-readers/builder_json.rb
|
@@ -139,26 +137,12 @@ files:
|
|
139
137
|
- lib/factbook-readers/utils_info.rb
|
140
138
|
- lib/factbook-readers/version.rb
|
141
139
|
- lib/factbook/readers.rb
|
142
|
-
- test/data/au.html
|
143
|
-
- test/data/au.yml
|
144
|
-
- test/data/be.html
|
145
|
-
- test/data/be.yml
|
146
|
-
- test/data/json/au.json
|
147
|
-
- test/data/src/ag.html
|
148
|
-
- test/data/src/au-2015-09-24.html
|
149
|
-
- test/data/src/au.html
|
150
|
-
- test/data/src/be-2015-09-24.html
|
151
|
-
- test/data/src/be.html
|
152
140
|
- test/helper.rb
|
153
|
-
- test/test_attribs.rb
|
154
|
-
- test/test_attribs_def.rb
|
155
141
|
- test/test_builder.rb
|
156
142
|
- test/test_codes.rb
|
157
143
|
- test/test_comparisons.rb
|
158
|
-
- test/test_convert.rb
|
159
144
|
- test/test_counter.rb
|
160
145
|
- test/test_fields.rb
|
161
|
-
- test/test_importer.rb
|
162
146
|
- test/test_item_builder.rb
|
163
147
|
- test/test_json.rb
|
164
148
|
- test/test_json_builder.rb
|