factbook-readers 0.0.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -64,12 +64,6 @@ def find_country_profile( html )
64
64
  end
65
65
 
66
66
 
67
- ## note: replace all non-breaking spaces with spaces for now
68
- ## see fr (france) in political parties section for example
69
- html = html.gsub( " ", ' ' )
70
-
71
-
72
-
73
67
  doc = Nokogiri::HTML( html )
74
68
 
75
69
  ul = doc.css( 'ul.expandcollapse' )[0]
@@ -77,30 +71,8 @@ def find_country_profile( html )
77
71
  puts ul.to_html[0..100]
78
72
 
79
73
 
80
-
81
- ## note: special case cc uses h2 instead of div block
82
- ## <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
83
- ## style="border-bottom: 2px solid white; cursor: pointer;">
84
- ## Introduction :: <span class="region">CURACAO </span>
85
- ## </h2>
86
- ## is old format !!!!
87
- ## cc - CURACAO
88
- ## http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
89
- ## page says - PAGE LAST UPDATED ON MARCH 14, 2018
90
- ## wait for new version to be generated / pushed!!!
91
-
92
- ## check for old format if h2 are present
93
- h2s = ul.css( 'h2' )
94
- if h2s.size > 0
95
- puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
96
- ## return empty html string - why? why not?
97
- return ''
98
- end
99
-
100
-
101
74
  ###
102
75
  ## sanitize
103
-
104
76
  ## remove link items
105
77
  ## assume two <li>s are a section
106
78
 
@@ -204,12 +176,12 @@ end
204
176
  # <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
205
177
  #
206
178
  # remove aria labels
207
- ARIA_ATTR_REGEX = /\s*
179
+ ARIA_ATTR_RE = /\s*
208
180
  aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
209
181
  /xim ## do NOT allow multi-line - why? why not?
210
182
 
211
183
  ## find double breaks e.g. <br><br>
212
- BR_BR_REGEX = /(<br> \s* <br>)
184
+ BR_BR_RE = /(<br> \s* <br>)
213
185
  /xim ## do NOT allow multi-line - why? why not?
214
186
 
215
187
 
@@ -252,6 +224,12 @@ def sanitize_data( el, title: )
252
224
  ## note: keep container div!! just replace inner html!!!
253
225
  ## note: right strip all trailing spaces/newlines for now
254
226
  ## plus add back a single one for pretty printing
227
+
228
+ ## note: replace all non-breaking spaces with spaces for now
229
+ ## see fr (france) in political parties section for example
230
+ ## todo/check/fix: check if we need to use unicode char!! and NOT html entity
231
+ inner_html = inner_html.gsub( "&nbsp;", ' ' )
232
+
255
233
  el.inner_html = inner_html.rstrip + "\n"
256
234
 
257
235
  # finally - convert back to html (string)
@@ -259,14 +237,14 @@ def sanitize_data( el, title: )
259
237
 
260
238
 
261
239
 
262
- html = html.gsub( ARIA_ATTR_REGEX ) do |m|
240
+ html = html.gsub( ARIA_ATTR_RE ) do |m|
263
241
  ## do not report / keep silent for now
264
242
  ## puts "in >#{title}< remove aria-label attr:"
265
243
  ## puts "#{m}"
266
244
  ''
267
245
  end
268
246
 
269
- html = html.gsub( BR_BR_REGEX ) do |m|
247
+ html = html.gsub( BR_BR_RE ) do |m|
270
248
  puts "in >#{title}< squish two <br>s into one:"
271
249
  puts "#{m}"
272
250
  '<br>'
@@ -280,11 +258,12 @@ def sanitize_data( el, title: )
280
258
 
281
259
  ## cleanup/remove ++ before subfield e.g.
282
260
  ## of: ++ => of: or such
261
+ ##
262
+ ## todo/fix: add negative lookahead e.g. not another + to be more specific!!
283
263
  html = html.gsub( %r{
284
264
  (?<=([a-z]:)|(:</span>)) # note: use zero-length positive lookbehind
285
265
  \s+
286
- \+{2}
287
- \s+}xim ) do |m|
266
+ \+{2}}xim ) do |m|
288
267
  puts "in >#{title} remove ++ before <field>: marker:"
289
268
  puts "#{m}"
290
269
  ' '
@@ -2,9 +2,9 @@
2
2
  module Factbook
3
3
  module Module
4
4
  module Readers
5
- MAJOR = 0
5
+ MAJOR = 1
6
6
  MINOR = 0
7
- PATCH = 1
7
+ PATCH = 0
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -1,11 +1,7 @@
1
- # encoding: utf-8
2
-
3
- ## $:.unshift(File.dirname(__FILE__))
4
-
5
1
  ## minitest setup
6
2
  require 'minitest/autorun'
7
3
 
8
4
 
9
5
  ## our own code
10
- require 'factbook'
6
+ require 'factbook/readers'
11
7
 
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_builder.rb
@@ -8,14 +6,11 @@
8
6
  require 'helper'
9
7
 
10
8
 
11
- ##
12
- ## use/fix: ASCII-8BIT (e.g.keep as is)
13
-
14
9
 
15
10
  class TestBuilder < MiniTest::Test
16
11
 
17
12
  def test_build
18
-
13
+
19
14
  ['au','be'].each do |code|
20
15
  ## use/fix: ASCII-8BIT (e.g.keep as is) -???
21
16
  ## fix/todo: use ASCII8BIT/binary reader ??
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_codes.rb
@@ -12,9 +10,7 @@ class TestCodes < MiniTest::Test
12
10
 
13
11
 
14
12
  def test_codes
15
-
16
- assert_equal 261, Factbook::CODES.size
17
- assert_equal 261, Factbook.codes.size
13
+ assert_equal 261, Factbook.codes.size
18
14
  assert_equal 261, Factbook.codes.to_a.size
19
15
 
20
16
 
@@ -27,7 +23,7 @@ class TestCodes < MiniTest::Test
27
23
 
28
24
  assert_equal 8, Factbook.codes.dependencies_us.size
29
25
 
30
-
26
+
31
27
  assert_equal 55, Factbook.codes.europe.size
32
28
  assert_equal 9, Factbook.codes.south_asia.size
33
29
  assert_equal 6, Factbook.codes.central_asia.size
@@ -43,11 +39,11 @@ class TestCodes < MiniTest::Test
43
39
  assert_equal 1, Factbook.codes.region('World').size
44
40
 
45
41
  assert_equal 45, Factbook.codes.countries.europe.size
46
-
42
+
47
43
  assert_equal Factbook.codes.category('Oceans').size, Factbook.codes.region('Oceans').size
48
44
  assert_equal Factbook.codes.category('World').size, Factbook.codes.region('World').size
49
45
 
50
-
46
+
51
47
  assert_equal 261, Factbook.codes.countries.size +
52
48
  Factbook.codes.others.size +
53
49
  Factbook.codes.dependencies.size +
@@ -68,7 +64,7 @@ class TestCodes < MiniTest::Test
68
64
  Factbook.codes.antartica.size +
69
65
  Factbook.codes.region('Oceans').size +
70
66
  Factbook.codes.region('World').size
71
-
67
+
72
68
  end
73
69
 
74
70
  end # class TestCodes
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_comparisons.rb
@@ -11,9 +9,8 @@ require 'helper'
11
9
  class TestComparisons < MiniTest::Test
12
10
 
13
11
  def test_comparisons
14
- assert_equal 74, Factbook::COMPARISONS.size
15
- assert_equal 74, Factbook.comparisons.size
16
- assert_equal 74, Factbook.comparisons.to_a.size
12
+ assert_equal 74, Factbook.comparisons.size
13
+ assert_equal 74, Factbook.comparisons.to_a.size
17
14
  end
18
15
 
19
16
  end # class TestComparisons
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_counter.rb
@@ -16,16 +14,16 @@ class TestCounter < MiniTest::Test
16
14
 
17
15
  def test_counter
18
16
  c = Factbook::Counter.new
19
-
17
+
20
18
  codes = %w(au be)
21
19
  codes.each do |code|
22
20
  c.count( read_test_page( code )) # use builtin test page (do NOT fetch via internet)
23
21
  end
24
-
22
+
25
23
  h = c.data
26
24
  pp h
27
-
28
- assert true ## assume everything ok if we get here
25
+
26
+ assert true ## assume everything ok if we get here
29
27
  end
30
28
 
31
29
  end # class TestCounter
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_fields.rb
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_item_builder.rb
@@ -15,10 +13,10 @@ class TestItemBuilder < MiniTest::Test
15
13
  html =<<EOS
16
14
  <div class=category_data>Central Europe, north of Italy and Slovenia</div>
17
15
  EOS
18
-
16
+
19
17
  b = Factbook::ItemBuilder.new( html, 'Location' )
20
18
  b.read
21
-
19
+
22
20
  assert true ## assume everything ok
23
21
  end
24
22
 
@@ -31,7 +29,7 @@ EOS
31
29
 
32
30
  b = Factbook::ItemBuilder.new( html, 'Area' )
33
31
  b.read
34
-
32
+
35
33
  assert true ## assume everything ok
36
34
  end
37
35
 
@@ -45,7 +43,7 @@ EOS
45
43
 
46
44
  b = Factbook::ItemBuilder.new( html, 'Land use' )
47
45
  b.read
48
-
46
+
49
47
  assert true ## assume everything ok
50
48
  end
51
49
 
@@ -57,7 +55,7 @@ EOS
57
55
 
58
56
  b = Factbook::ItemBuilder.new( html, 'Contraceptive Prevalence Rate' )
59
57
  b.read
60
-
58
+
61
59
  assert true ## assume everything ok
62
60
  end
63
61
 
@@ -75,7 +73,7 @@ EOS
75
73
 
76
74
  b = Factbook::ItemBuilder.new( html, 'Drinking Water Source' )
77
75
  b.read
78
-
76
+
79
77
  assert true ## assume everything ok
80
78
  end
81
79
 
@@ -91,7 +89,7 @@ EOS
91
89
 
92
90
  b = Factbook::ItemBuilder.new( html, 'Political pressure groups and leaders' )
93
91
  b.read
94
-
92
+
95
93
  assert true ## assume everything ok
96
94
  end
97
95
 
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_json.rb
@@ -35,7 +33,7 @@ class TestJson < MiniTest::Test
35
33
 
36
34
  ### save to json
37
35
  puts "saving a copy to #{code}.json for debugging"
38
- File.open( "tmp/#{code}.json", 'w' ) do |f|
36
+ File.open( "tmp/#{code}.json", 'w:utf-8' ) do |f|
39
37
  f.write JSON.pretty_generate( h )
40
38
  ## f.write page.to_json
41
39
  end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_json_builder.rb
@@ -13,7 +11,7 @@ class TestJsonBuilder < MiniTest::Test
13
11
  def test_read
14
12
  code = 'au'
15
13
  b = Factbook::JsonBuilder.from_file( "#{Factbook.root}/test/data/json/#{code}.json" )
16
-
14
+
17
15
  assert_equal 10, b.sects.size
18
16
  assert_equal 1, b.sects[0].subsects.size ## e.g. Introduction/Background
19
17
  assert_equal 'Central Europe, north of Italy and Slovenia', b.json['Geography']['Location']['text']
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_normalize.rb
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_page.rb
@@ -22,14 +20,14 @@ class TestPage < MiniTest::Test
22
20
  # [ 'ls', 9 ],
23
21
  # [ 'vt', 8 ],
24
22
  ]
25
-
23
+
26
24
  pages.each do |rec|
27
25
  code = rec[0]
28
26
  sects_size = rec[1]
29
27
 
30
28
  html = File.read( "#{Factbook.root}/test/data/src/#{code}.html" )
31
29
  page = Factbook::Page.new( code, html: html )
32
-
30
+
33
31
  assert_equal sects_size, page.sects.size
34
32
  end
35
33
  end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_sanitizer.rb
@@ -18,10 +16,9 @@ class TestSanitizer < MiniTest::Test
18
16
  ## ['au'].each do |cnty|
19
17
  ['au','ag','be'].each do |cnty|
20
18
 
21
- ## use/fix: ASCII-8BIT (e.g.keep as is) -???
22
- html_ascii = File.read( "#{Factbook.root}/test/data/src/#{cnty}.html" ) ## fix/todo: use ASCII8BIT/binary reader ??
19
+ html_original = File.read( "#{Factbook.root}/test/data/src/#{cnty}.html", 'r:utf-8' ) { |f| r.read }
23
20
 
24
- html, info, errors = Factbook::Sanitizer.new.sanitize( html_ascii )
21
+ html, info, errors = Factbook::Sanitizer.new.sanitize( html_original )
25
22
 
26
23
  File.open( "./tmp/#{cnty}.profile.html", 'w' ) do |f|
27
24
  f.write "** info:\n"
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ###
4
2
  # to run use
5
3
  # ruby -I ./lib -I ./test test/test_sanitizer_regex.rb
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: factbook-readers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-11-26 00:00:00.000000000 Z
11
+ date: 2020-11-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: logutils
@@ -114,13 +114,11 @@ files:
114
114
  - Manifest.txt
115
115
  - README.md
116
116
  - Rakefile
117
- - data/attributes.yml
118
117
  - data/categories.csv
119
118
  - data/codes.csv
120
119
  - data/codesxref.csv
121
120
  - data/comparisons.csv
122
121
  - lib/factbook-readers.rb
123
- - lib/factbook-readers/attributes.rb
124
122
  - lib/factbook-readers/builder.rb
125
123
  - lib/factbook-readers/builder_item.rb
126
124
  - lib/factbook-readers/builder_json.rb
@@ -139,26 +137,12 @@ files:
139
137
  - lib/factbook-readers/utils_info.rb
140
138
  - lib/factbook-readers/version.rb
141
139
  - lib/factbook/readers.rb
142
- - test/data/au.html
143
- - test/data/au.yml
144
- - test/data/be.html
145
- - test/data/be.yml
146
- - test/data/json/au.json
147
- - test/data/src/ag.html
148
- - test/data/src/au-2015-09-24.html
149
- - test/data/src/au.html
150
- - test/data/src/be-2015-09-24.html
151
- - test/data/src/be.html
152
140
  - test/helper.rb
153
- - test/test_attribs.rb
154
- - test/test_attribs_def.rb
155
141
  - test/test_builder.rb
156
142
  - test/test_codes.rb
157
143
  - test/test_comparisons.rb
158
- - test/test_convert.rb
159
144
  - test/test_counter.rb
160
145
  - test/test_fields.rb
161
- - test/test_importer.rb
162
146
  - test/test_item_builder.rb
163
147
  - test/test_json.rb
164
148
  - test/test_json_builder.rb