factbook-readers 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,12 +64,6 @@ def find_country_profile( html )
64
64
  end
65
65
 
66
66
 
67
- ## note: replace all non-breaking spaces with spaces for now
68
- ## see fr (france) in political parties section for example
69
- html = html.gsub( " ", ' ' )
70
-
71
-
72
-
73
67
  doc = Nokogiri::HTML( html )
74
68
 
75
69
  ul = doc.css( 'ul.expandcollapse' )[0]
@@ -77,30 +71,8 @@ def find_country_profile( html )
77
71
  puts ul.to_html[0..100]
78
72
 
79
73
 
80
-
81
- ## note: special case cc uses h2 instead of div block
82
- ## <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
83
- ## style="border-bottom: 2px solid white; cursor: pointer;">
84
- ## Introduction :: <span class="region">CURACAO </span>
85
- ## </h2>
86
- ## is old format !!!!
87
- ## cc - CURACAO
88
- ## http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
89
- ## page says - PAGE LAST UPDATED ON MARCH 14, 2018
90
- ## wait for new version to be generated / pushed!!!
91
-
92
- ## check for old format if h2 are present
93
- h2s = ul.css( 'h2' )
94
- if h2s.size > 0
95
- puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
96
- ## return empty html string - why? why not?
97
- return ''
98
- end
99
-
100
-
101
74
  ###
102
75
  ## sanitize
103
-
104
76
  ## remove link items
105
77
  ## assume two <li>s are a section
106
78
 
@@ -204,12 +176,12 @@ end
204
176
  # <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
205
177
  #
206
178
  # remove aria labels
207
- ARIA_ATTR_REGEX = /\s*
179
+ ARIA_ATTR_RE = /\s*
208
180
  aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
209
181
  /xim ## do NOT allow multi-line - why? why not?
210
182
 
211
183
  ## find double breaks e.g. <br><br>
212
- BR_BR_REGEX = /(<br> \s* <br>)
184
+ BR_BR_RE = /(<br> \s* <br>)
213
185
  /xim ## do NOT allow multi-line - why? why not?
214
186
 
215
187
 
@@ -252,6 +224,12 @@ def sanitize_data( el, title: )
252
224
  ## note: keep container div!! just replace inner html!!!
253
225
  ## note: right strip all trailing spaces/newlines for now
254
226
  ## plus add back a single one for pretty printing
227
+
228
+ ## note: replace all non-breaking spaces with spaces for now
229
+ ## see fr (france) in political parties section for example
230
+ ## todo/check/fix: check if we need to use unicode char!! and NOT html entity
231
+ inner_html = inner_html.gsub( "&nbsp;", ' ' )
232
+
255
233
  el.inner_html = inner_html.rstrip + "\n"
256
234
 
257
235
  # finally - convert back to html (string)
@@ -259,14 +237,14 @@ def sanitize_data( el, title: )
259
237
 
260
238
 
261
239
 
262
- html = html.gsub( ARIA_ATTR_REGEX ) do |m|
240
+ html = html.gsub( ARIA_ATTR_RE ) do |m|
263
241
  ## do not report / keep silent for now
264
242
  ## puts "in >#{title}< remove aria-label attr:"
265
243
  ## puts "#{m}"
266
244
  ''
267
245
  end
268
246
 
269
- html = html.gsub( BR_BR_REGEX ) do |m|
247
+ html = html.gsub( BR_BR_RE ) do |m|
270
248
  puts "in >#{title}< squish two <br>s into one:"
271
249
  puts "#{m}"
272
250
  '<br>'
@@ -280,11 +258,12 @@ def sanitize_data( el, title: )
280
258
 
281
259
  ## cleanup/remove ++ before subfield e.g.
282
260
  ## of: ++ => of: or such
261
+ ##
262
+ ## todo/fix: add negative lookahead e.g. not another + to be more specific!!
283
263
  html = html.gsub( %r{
284
264
  (?<=([a-z]:)|(:</span>)) # note: use zero-length positive lookbehind
285
265
  \s+
286
- \+{2}
287
- \s+}xim ) do |m|
266
+ \+{2}}xim ) do |m|
288
267
  puts "in >#{title} remove ++ before <field>: marker:"
289
268
  puts "#{m}"
290
269
  ' '
@@ -2,9 +2,9 @@
2
2
  module Factbook
3
3
  module Module
4
4
  module Readers
5
- MAJOR = 0
5
+ MAJOR = 1
6
6
  MINOR = 0
7
- PATCH = 1
7
+ PATCH = 0
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -1,11 +1,7 @@
1
- # encoding: utf-8
2
-
3
- ## $:.unshift(File.dirname(__FILE__))
4
-
5
1
  ## minitest setup
6
2
  require 'minitest/autorun'
7
3
 
8
4
 
9
5
  ## our own code
10
- require 'factbook'
6
+ require 'factbook/readers'
11
7
 
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_builder.rb
@@ -8,14 +6,11 @@
8
6
  require 'helper'
9
7
 
10
8
 
11
- ##
12
- ## use/fix: ASCII-8BIT (e.g.keep as is)
13
-
14
9
 
15
10
  class TestBuilder < MiniTest::Test
16
11
 
17
12
  def test_build
18
-
13
+
19
14
  ['au','be'].each do |code|
20
15
  ## use/fix: ASCII-8BIT (e.g.keep as is) -???
21
16
  ## fix/todo: use ASCII8BIT/binary reader ??
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_codes.rb
@@ -12,9 +10,7 @@ class TestCodes < MiniTest::Test
12
10
 
13
11
 
14
12
  def test_codes
15
-
16
- assert_equal 261, Factbook::CODES.size
17
- assert_equal 261, Factbook.codes.size
13
+ assert_equal 261, Factbook.codes.size
18
14
  assert_equal 261, Factbook.codes.to_a.size
19
15
 
20
16
 
@@ -27,7 +23,7 @@ class TestCodes < MiniTest::Test
27
23
 
28
24
  assert_equal 8, Factbook.codes.dependencies_us.size
29
25
 
30
-
26
+
31
27
  assert_equal 55, Factbook.codes.europe.size
32
28
  assert_equal 9, Factbook.codes.south_asia.size
33
29
  assert_equal 6, Factbook.codes.central_asia.size
@@ -43,11 +39,11 @@ class TestCodes < MiniTest::Test
43
39
  assert_equal 1, Factbook.codes.region('World').size
44
40
 
45
41
  assert_equal 45, Factbook.codes.countries.europe.size
46
-
42
+
47
43
  assert_equal Factbook.codes.category('Oceans').size, Factbook.codes.region('Oceans').size
48
44
  assert_equal Factbook.codes.category('World').size, Factbook.codes.region('World').size
49
45
 
50
-
46
+
51
47
  assert_equal 261, Factbook.codes.countries.size +
52
48
  Factbook.codes.others.size +
53
49
  Factbook.codes.dependencies.size +
@@ -68,7 +64,7 @@ class TestCodes < MiniTest::Test
68
64
  Factbook.codes.antartica.size +
69
65
  Factbook.codes.region('Oceans').size +
70
66
  Factbook.codes.region('World').size
71
-
67
+
72
68
  end
73
69
 
74
70
  end # class TestCodes
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_comparisons.rb
@@ -11,9 +9,8 @@ require 'helper'
11
9
  class TestComparisons < MiniTest::Test
12
10
 
13
11
  def test_comparisons
14
- assert_equal 74, Factbook::COMPARISONS.size
15
- assert_equal 74, Factbook.comparisons.size
16
- assert_equal 74, Factbook.comparisons.to_a.size
12
+ assert_equal 74, Factbook.comparisons.size
13
+ assert_equal 74, Factbook.comparisons.to_a.size
17
14
  end
18
15
 
19
16
  end # class TestComparisons
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_counter.rb
@@ -16,16 +14,16 @@ class TestCounter < MiniTest::Test
16
14
 
17
15
  def test_counter
18
16
  c = Factbook::Counter.new
19
-
17
+
20
18
  codes = %w(au be)
21
19
  codes.each do |code|
22
20
  c.count( read_test_page( code )) # use builtin test page (do NOT fetch via internet)
23
21
  end
24
-
22
+
25
23
  h = c.data
26
24
  pp h
27
-
28
- assert true ## assume everything ok if we get here
25
+
26
+ assert true ## assume everything ok if we get here
29
27
  end
30
28
 
31
29
  end # class TestCounter
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_fields.rb
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_item_builder.rb
@@ -15,10 +13,10 @@ class TestItemBuilder < MiniTest::Test
15
13
  html =<<EOS
16
14
  <div class=category_data>Central Europe, north of Italy and Slovenia</div>
17
15
  EOS
18
-
16
+
19
17
  b = Factbook::ItemBuilder.new( html, 'Location' )
20
18
  b.read
21
-
19
+
22
20
  assert true ## assume everything ok
23
21
  end
24
22
 
@@ -31,7 +29,7 @@ EOS
31
29
 
32
30
  b = Factbook::ItemBuilder.new( html, 'Area' )
33
31
  b.read
34
-
32
+
35
33
  assert true ## assume everything ok
36
34
  end
37
35
 
@@ -45,7 +43,7 @@ EOS
45
43
 
46
44
  b = Factbook::ItemBuilder.new( html, 'Land use' )
47
45
  b.read
48
-
46
+
49
47
  assert true ## assume everything ok
50
48
  end
51
49
 
@@ -57,7 +55,7 @@ EOS
57
55
 
58
56
  b = Factbook::ItemBuilder.new( html, 'Contraceptive Prevalence Rate' )
59
57
  b.read
60
-
58
+
61
59
  assert true ## assume everything ok
62
60
  end
63
61
 
@@ -75,7 +73,7 @@ EOS
75
73
 
76
74
  b = Factbook::ItemBuilder.new( html, 'Drinking Water Source' )
77
75
  b.read
78
-
76
+
79
77
  assert true ## assume everything ok
80
78
  end
81
79
 
@@ -91,7 +89,7 @@ EOS
91
89
 
92
90
  b = Factbook::ItemBuilder.new( html, 'Political pressure groups and leaders' )
93
91
  b.read
94
-
92
+
95
93
  assert true ## assume everything ok
96
94
  end
97
95
 
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_json.rb
@@ -35,7 +33,7 @@ class TestJson < MiniTest::Test
35
33
 
36
34
  ### save to json
37
35
  puts "saving a copy to #{code}.json for debugging"
38
- File.open( "tmp/#{code}.json", 'w' ) do |f|
36
+ File.open( "tmp/#{code}.json", 'w:utf-8' ) do |f|
39
37
  f.write JSON.pretty_generate( h )
40
38
  ## f.write page.to_json
41
39
  end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_json_builder.rb
@@ -13,7 +11,7 @@ class TestJsonBuilder < MiniTest::Test
13
11
  def test_read
14
12
  code = 'au'
15
13
  b = Factbook::JsonBuilder.from_file( "#{Factbook.root}/test/data/json/#{code}.json" )
16
-
14
+
17
15
  assert_equal 10, b.sects.size
18
16
  assert_equal 1, b.sects[0].subsects.size ## e.g. Introduction/Background
19
17
  assert_equal 'Central Europe, north of Italy and Slovenia', b.json['Geography']['Location']['text']
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_normalize.rb
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_page.rb
@@ -22,14 +20,14 @@ class TestPage < MiniTest::Test
22
20
  # [ 'ls', 9 ],
23
21
  # [ 'vt', 8 ],
24
22
  ]
25
-
23
+
26
24
  pages.each do |rec|
27
25
  code = rec[0]
28
26
  sects_size = rec[1]
29
27
 
30
28
  html = File.read( "#{Factbook.root}/test/data/src/#{code}.html" )
31
29
  page = Factbook::Page.new( code, html: html )
32
-
30
+
33
31
  assert_equal sects_size, page.sects.size
34
32
  end
35
33
  end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_sanitizer.rb
@@ -18,10 +16,9 @@ class TestSanitizer < MiniTest::Test
18
16
  ## ['au'].each do |cnty|
19
17
  ['au','ag','be'].each do |cnty|
20
18
 
21
- ## use/fix: ASCII-8BIT (e.g.keep as is) -???
22
- html_ascii = File.read( "#{Factbook.root}/test/data/src/#{cnty}.html" ) ## fix/todo: use ASCII8BIT/binary reader ??
19
+ html_original = File.read( "#{Factbook.root}/test/data/src/#{cnty}.html", 'r:utf-8' ) { |f| r.read }
23
20
 
24
- html, info, errors = Factbook::Sanitizer.new.sanitize( html_ascii )
21
+ html, info, errors = Factbook::Sanitizer.new.sanitize( html_original )
25
22
 
26
23
  File.open( "./tmp/#{cnty}.profile.html", 'w' ) do |f|
27
24
  f.write "** info:\n"
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_sanitizer_regex.rb
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: factbook-readers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-26 00:00:00.000000000 Z
11
+ date: 2020-11-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: logutils
@@ -114,13 +114,11 @@ files:
114
114
  - Manifest.txt
115
115
  - README.md
116
116
  - Rakefile
117
- - data/attributes.yml
118
117
  - data/categories.csv
119
118
  - data/codes.csv
120
119
  - data/codesxref.csv
121
120
  - data/comparisons.csv
122
121
  - lib/factbook-readers.rb
123
- - lib/factbook-readers/attributes.rb
124
122
  - lib/factbook-readers/builder.rb
125
123
  - lib/factbook-readers/builder_item.rb
126
124
  - lib/factbook-readers/builder_json.rb
@@ -139,26 +137,12 @@ files:
139
137
  - lib/factbook-readers/utils_info.rb
140
138
  - lib/factbook-readers/version.rb
141
139
  - lib/factbook/readers.rb
142
- - test/data/au.html
143
- - test/data/au.yml
144
- - test/data/be.html
145
- - test/data/be.yml
146
- - test/data/json/au.json
147
- - test/data/src/ag.html
148
- - test/data/src/au-2015-09-24.html
149
- - test/data/src/au.html
150
- - test/data/src/be-2015-09-24.html
151
- - test/data/src/be.html
152
140
  - test/helper.rb
153
- - test/test_attribs.rb
154
- - test/test_attribs_def.rb
155
141
  - test/test_builder.rb
156
142
  - test/test_codes.rb
157
143
  - test/test_comparisons.rb
158
- - test/test_convert.rb
159
144
  - test/test_counter.rb
160
145
  - test/test_fields.rb
161
- - test/test_importer.rb
162
146
  - test/test_item_builder.rb
163
147
  - test/test_json.rb
164
148
  - test/test_json_builder.rb