factbook 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ecdb08c833f823392dd8bacf5f0ca6186015ebb4
4
- data.tar.gz: c6829c25c02f075ef65e129bc396b1d2b3aba8f0
3
+ metadata.gz: 5ef14a3d11b87b854592f81c50406e43f4399ac3
4
+ data.tar.gz: 8c4eb43f4dfdca5b20d4dd60699b51c52a8cc7ff
5
5
  SHA512:
6
- metadata.gz: c40f0716ae27cc3c6d1a2a8ea9d2cdaf66b421e649396a9c141fa0b76539afed15fb694a567f7d8f705664bdf24751b8eb92b6b8bcda1c1a3869a5c2dbad75f4
7
- data.tar.gz: 456b32abafb674a3e0e4205cf70d97b6fc449e4c286e29c5cb96d25da72a75184c87c0aa0fee58d483747fd16c33d8a8287d77b55a1cfce8bb18c84534021439
6
+ metadata.gz: 57066d2a00d4818330ca710947fa5fedfb80ad224b84d31d906eb7606ba8127b8c1f102405a61f6dd3742b3c6f0fa4ec10f290a8d1da3367747a85ca31279a14
7
+ data.tar.gz: 6aa9fa498d6a1f836300e3fe985a5abd2cf8770211d414427608d2515f3d4fa56108b798e17a7e8d2b0f1bc52f8347787fd431cc97a099877eeafd89fffb12af
@@ -36,7 +36,7 @@ attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/bin
36
36
 
37
37
  def initialize( html_ascii )
38
38
  @html_ascii = html_ascii
39
-
39
+
40
40
  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
41
  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
42
42
 
@@ -53,7 +53,7 @@ def initialize( html_ascii )
53
53
  html_subsects = html_sect[1]
54
54
  puts html_sect_head
55
55
  puts html_subsects.size
56
-
56
+
57
57
  ## get section title
58
58
  ## @SECTION{Economy} => Economy
59
59
  if html_sect_head =~ /@SECTION{(.+?)}/
@@ -74,7 +74,7 @@ def initialize( html_ascii )
74
74
  puts title
75
75
  subsect = Subsect.new
76
76
  subsect.title = title ## todo/fix: cut off trailing colon (:)
77
-
77
+
78
78
  b = Factbook::ItemBuilder.new( html_subsect_body, title )
79
79
  h = b.read
80
80
  subsect.data = h
@@ -88,32 +88,23 @@ def initialize( html_ascii )
88
88
  @sects << sect
89
89
  else
90
90
  ## warn/fix: no section title found
91
- end
91
+ end
92
92
  end
93
-
93
+
94
94
  self ## return self -- needed?? default (standard) anyway?? check and remove
95
95
  end
96
96
 
97
97
 
98
98
 
99
99
  def map_sects( html )
100
- ## convert section titles
101
- ## from <h2>..</h2>
102
- ## to "unified" marker
103
-
104
- ## e.g.
105
- ## <h2 sectiontitle='Introduction' ccode='au'>Introduction :: <span class='region'>AUSTRIA </span></h2>
106
- ## <h2>Introduction</h2>
100
+ ## convert section titles to "unified" marker
101
+ ## e.g.
102
+ ## <h2>Introduction</h2>
107
103
 
108
- title_regex= /<h2
109
- (?:\s[^>]+)? ## allow optional attributes in h2
110
- >
104
+ title_regex= /<h2>
111
105
  \s*
112
- ([^<>]+?) ## note: use non-greedy; do NOT allow tags inside for now
106
+ (.+?) ## note: use non-greedy; do NOT allow tags inside for now
113
107
  \s*
114
- (?:\s::\s
115
- .+? ## note: use non-greedy; allows tags inside
116
- )? ## strip optional name (e.g. :: AUSTRIA)
117
108
  <\/h2>
118
109
  /xim
119
110
 
@@ -121,33 +112,29 @@ def map_sects( html )
121
112
  puts "** found section >#{$1}<:"
122
113
  puts " >|#{m}|<"
123
114
 
124
- "\n\n@SECTION{#{$1}}\n\n"
115
+ "\n\n@SECTION{#{$1}}\n\n"
125
116
  end
126
117
  html
127
118
  end
128
119
 
129
120
 
130
121
  def map_subsects( html )
131
- ## convert subsection titles
132
- ## from <div id='field'>..</div>
133
- ## to "unified" marker
122
+ ## convert subsection titles to "unified" marker
123
+ ## e.g.
124
+ ## <h3>Disputes - international:</h3>
134
125
 
135
- ## e.g.
136
- ## <div id='field' class='category'>Disputes - international:</div>
137
-
138
- title_regex= /<div \s id='field'
139
- \s class='category'>
140
- \s*
126
+ title_regex= /<h3>
127
+ \s*
141
128
  (.+?) ## note: use non-greedy; allows tags inside - why? why not
142
- \s*
143
- <\/div>
129
+ \s*
130
+ <\/h3>
144
131
  /xim
145
132
 
146
133
  html = html.gsub( title_regex ) do |m|
147
134
  puts "** found subsection >#{$1}<:"
148
135
  puts " >|#{m}|<"
149
136
 
150
- "\n@SUBSECTION{#{$1}}\n"
137
+ "\n@SUBSECTION{#{$1}}\n"
151
138
  end
152
139
  html
153
140
  end
@@ -166,9 +153,9 @@ def split_sects( html )
166
153
  ## String#split will include all catpure groups in the result array
167
154
 
168
155
  section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
169
-
156
+
170
157
  chunks = html.split( section_regex )
171
-
158
+
172
159
  ## check if first item is a section or (html) prolog
173
160
  # if prolog (remove)
174
161
  chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
@@ -195,9 +182,9 @@ def split_subsects( html )
195
182
 
196
183
  ## note: "wrap" regex in a capture group (just one)
197
184
  ## String#split will include all catpure groups in the result array
198
-
185
+
199
186
  subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
200
-
187
+
201
188
  chunks = html.split( subsection_regex )
202
189
 
203
190
  ## check if first item is a section or (html) prolog
@@ -209,6 +209,35 @@ REL_AFFILIATION_CATEGORY_REGEX = /
209
209
  /xim
210
210
 
211
211
 
212
+ ##########################################
213
+ ## transforms / simplify
214
+ ##
215
+ ## <h2 sectiontitle='Introduction' ccode='ag'>
216
+ ## Introduction :: <span class='region'>ALGERIA </span>
217
+ ## </h2>
218
+ ## becomes =>
219
+ ## <h2>Introduction</h2>
220
+ ##
221
+ ## todo/fix: use named capture in future e.g.
222
+ ## (?<text>.+?) instead of (.+?)
223
+ ## not working for now w/ gsub (just passed in match string NOT match data)
224
+
225
+ CLEANUP_SECTION_REGEX = /
226
+ <h2 [^>]*>
227
+ (.+?)
228
+ <\/h2>
229
+ /xim
230
+
231
+ ##
232
+ ## <div id='field' class='category'>Electricity - consumption:</div>
233
+ ## becomes =>
234
+ ## <h3>Electricity - consumption:</h3>
235
+
236
+ CLEANUP_SUBSECTION_REGEX = /
237
+ <div \s id='field' [^>]*>
238
+ (.+?)
239
+ <\/div>
240
+ /xim
212
241
 
213
242
 
214
243
 
@@ -285,7 +314,7 @@ def sanitize_profile( html )
285
314
 
286
315
  klasses = $2.split(' ')
287
316
  klasses = klasses.select do |klass|
288
- if ['region', 'category', 'category_data'].include?( klass )
317
+ if ['category', 'category_data'].include?( klass )
289
318
  true
290
319
  else
291
320
  puts " remove class #{klass}"
@@ -300,6 +329,33 @@ def sanitize_profile( html )
300
329
  end
301
330
  end
302
331
 
332
+
333
+ ##################################################################
334
+ ## simplify/cleanup section and subsection headings
335
+
336
+ html = html.gsub( CLEANUP_SECTION_REGEX ) do |_|
337
+ puts " cleanup section (h2) heading >#{$1}<"
338
+
339
+ text = $1
340
+ pos = text.index( '::' )
341
+ if pos ## if includes => :: <span> Region </span> -- cut off
342
+ puts " remove :: region/country from heading"
343
+ text = text[0...pos]
344
+ end
345
+ text = text.strip # remove trailing space too
346
+
347
+ "<h2>#{text}</h2>"
348
+ end
349
+
350
+ html = html.gsub( CLEANUP_SUBSECTION_REGEX ) do |_|
351
+ puts " cleanup subsection (h3) heading >#{$1}<"
352
+
353
+ text = $1
354
+ text = text.strip # remove trailing space too
355
+
356
+ "<h3>#{text}</h3>"
357
+ end
358
+
303
359
  html
304
360
  end
305
361
 
@@ -4,7 +4,7 @@ module Factbook
4
4
 
5
5
  MAJOR = 1
6
6
  MINOR = 2
7
- PATCH = 1
7
+ PATCH = 2
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -15,6 +15,7 @@ class TestSanitizer < MiniTest::Test
15
15
  ## austria (au)
16
16
  ## algeria (ag)
17
17
  ## belgium (be)
18
+ ## ['au'].each do |cnty|
18
19
  ['au','ag','be'].each do |cnty|
19
20
 
20
21
  ## use/fix: ASCII-8BIT (e.g.keep as is) -???
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: factbook
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-06 00:00:00.000000000 Z
11
+ date: 2016-11-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: logutils