factbook 1.2.1 → 1.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ecdb08c833f823392dd8bacf5f0ca6186015ebb4
4
- data.tar.gz: c6829c25c02f075ef65e129bc396b1d2b3aba8f0
3
+ metadata.gz: 5ef14a3d11b87b854592f81c50406e43f4399ac3
4
+ data.tar.gz: 8c4eb43f4dfdca5b20d4dd60699b51c52a8cc7ff
5
5
  SHA512:
6
- metadata.gz: c40f0716ae27cc3c6d1a2a8ea9d2cdaf66b421e649396a9c141fa0b76539afed15fb694a567f7d8f705664bdf24751b8eb92b6b8bcda1c1a3869a5c2dbad75f4
7
- data.tar.gz: 456b32abafb674a3e0e4205cf70d97b6fc449e4c286e29c5cb96d25da72a75184c87c0aa0fee58d483747fd16c33d8a8287d77b55a1cfce8bb18c84534021439
6
+ metadata.gz: 57066d2a00d4818330ca710947fa5fedfb80ad224b84d31d906eb7606ba8127b8c1f102405a61f6dd3742b3c6f0fa4ec10f290a8d1da3367747a85ca31279a14
7
+ data.tar.gz: 6aa9fa498d6a1f836300e3fe985a5abd2cf8770211d414427608d2515f3d4fa56108b798e17a7e8d2b0f1bc52f8347787fd431cc97a099877eeafd89fffb12af
@@ -36,7 +36,7 @@ attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/bin
36
36
 
37
37
  def initialize( html_ascii )
38
38
  @html_ascii = html_ascii
39
-
39
+
40
40
  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
41
  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
42
42
 
@@ -53,7 +53,7 @@ def initialize( html_ascii )
53
53
  html_subsects = html_sect[1]
54
54
  puts html_sect_head
55
55
  puts html_subsects.size
56
-
56
+
57
57
  ## get section title
58
58
  ## @SECTION{Economy} => Economy
59
59
  if html_sect_head =~ /@SECTION{(.+?)}/
@@ -74,7 +74,7 @@ def initialize( html_ascii )
74
74
  puts title
75
75
  subsect = Subsect.new
76
76
  subsect.title = title ## todo/fix: cut off trailing colon (:)
77
-
77
+
78
78
  b = Factbook::ItemBuilder.new( html_subsect_body, title )
79
79
  h = b.read
80
80
  subsect.data = h
@@ -88,32 +88,23 @@ def initialize( html_ascii )
88
88
  @sects << sect
89
89
  else
90
90
  ## warn/fix: no section title found
91
- end
91
+ end
92
92
  end
93
-
93
+
94
94
  self ## return self -- needed?? default (standard) anyway?? check and remove
95
95
  end
96
96
 
97
97
 
98
98
 
99
99
  def map_sects( html )
100
- ## convert section titles
101
- ## from <h2>..</h2>
102
- ## to "unified" marker
103
-
104
- ## e.g.
105
- ## <h2 sectiontitle='Introduction' ccode='au'>Introduction :: <span class='region'>AUSTRIA </span></h2>
106
- ## <h2>Introduction</h2>
100
+ ## convert section titles to "unified" marker
101
+ ## e.g.
102
+ ## <h2>Introduction</h2>
107
103
 
108
- title_regex= /<h2
109
- (?:\s[^>]+)? ## allow optional attributes in h2
110
- >
104
+ title_regex= /<h2>
111
105
  \s*
112
- ([^<>]+?) ## note: use non-greedy; do NOT allow tags inside for now
106
+ (.+?) ## note: use non-greedy; do NOT allow tags inside for now
113
107
  \s*
114
- (?:\s::\s
115
- .+? ## note: use non-greedy; allows tags inside
116
- )? ## strip optional name (e.g. :: AUSTRIA)
117
108
  <\/h2>
118
109
  /xim
119
110
 
@@ -121,33 +112,29 @@ def map_sects( html )
121
112
  puts "** found section >#{$1}<:"
122
113
  puts " >|#{m}|<"
123
114
 
124
- "\n\n@SECTION{#{$1}}\n\n"
115
+ "\n\n@SECTION{#{$1}}\n\n"
125
116
  end
126
117
  html
127
118
  end
128
119
 
129
120
 
130
121
  def map_subsects( html )
131
- ## convert subsection titles
132
- ## from <div id='field'>..</div>
133
- ## to "unified" marker
122
+ ## convert subsection titles to "unified" marker
123
+ ## e.g.
124
+ ## <h3>Disputes - international:</h3>
134
125
 
135
- ## e.g.
136
- ## <div id='field' class='category'>Disputes - international:</div>
137
-
138
- title_regex= /<div \s id='field'
139
- \s class='category'>
140
- \s*
126
+ title_regex= /<h3>
127
+ \s*
141
128
  (.+?) ## note: use non-greedy; allows tags inside - why? why not
142
- \s*
143
- <\/div>
129
+ \s*
130
+ <\/h3>
144
131
  /xim
145
132
 
146
133
  html = html.gsub( title_regex ) do |m|
147
134
  puts "** found subsection >#{$1}<:"
148
135
  puts " >|#{m}|<"
149
136
 
150
- "\n@SUBSECTION{#{$1}}\n"
137
+ "\n@SUBSECTION{#{$1}}\n"
151
138
  end
152
139
  html
153
140
  end
@@ -166,9 +153,9 @@ def split_sects( html )
166
153
  ## String#split will include all capture groups in the result array
167
154
 
168
155
  section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
169
-
156
+
170
157
  chunks = html.split( section_regex )
171
-
158
+
172
159
  ## check if first item is a section or (html) prolog
173
160
  # if prolog (remove)
174
161
  chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
@@ -195,9 +182,9 @@ def split_subsects( html )
195
182
 
196
183
  ## note: "wrap" regex in a capture group (just one)
197
184
  ## String#split will include all capture groups in the result array
198
-
185
+
199
186
  subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
200
-
187
+
201
188
  chunks = html.split( subsection_regex )
202
189
 
203
190
  ## check if first item is a section or (html) prolog
@@ -209,6 +209,35 @@ REL_AFFILIATION_CATEGORY_REGEX = /
209
209
  /xim
210
210
 
211
211
 
212
+ ##########################################
213
+ ## transforms / simplify
214
+ ##
215
+ ## <h2 sectiontitle='Introduction' ccode='ag'>
216
+ ## Introduction :: <span class='region'>ALGERIA </span>
217
+ ## </h2>
218
+ ## becomes =>
219
+ ## <h2>Introduction</h2>
220
+ ##
221
+ ## todo/fix: use named capture in future e.g.
222
+ ## (?<text>.+?) instead of (.+?)
223
+ ## not working for now w/ gsub (just passed in match string NOT match data)
224
+
225
+ CLEANUP_SECTION_REGEX = /
226
+ <h2 [^>]*>
227
+ (.+?)
228
+ <\/h2>
229
+ /xim
230
+
231
+ ##
232
+ ## <div id='field' class='category'>Electricity - consumption:</div>
233
+ ## becomes =>
234
+ ## <h3>Electricity - consumption:</h3>
235
+
236
+ CLEANUP_SUBSECTION_REGEX = /
237
+ <div \s id='field' [^>]*>
238
+ (.+?)
239
+ <\/div>
240
+ /xim
212
241
 
213
242
 
214
243
 
@@ -285,7 +314,7 @@ def sanitize_profile( html )
285
314
 
286
315
  klasses = $2.split(' ')
287
316
  klasses = klasses.select do |klass|
288
- if ['region', 'category', 'category_data'].include?( klass )
317
+ if ['category', 'category_data'].include?( klass )
289
318
  true
290
319
  else
291
320
  puts " remove class #{klass}"
@@ -300,6 +329,33 @@ def sanitize_profile( html )
300
329
  end
301
330
  end
302
331
 
332
+
333
+ ##################################################################
334
+ ## simplify/cleanup section and subsection headings
335
+
336
+ html = html.gsub( CLEANUP_SECTION_REGEX ) do |_|
337
+ puts " cleanup section (h2) heading >#{$1}<"
338
+
339
+ text = $1
340
+ pos = text.index( '::' )
341
+ if pos ## if includes => :: <span> Region </span> -- cut off
342
+ puts " remove :: region/country from heading"
343
+ text = text[0...pos]
344
+ end
345
+ text = text.strip # remove trailing space too
346
+
347
+ "<h2>#{text}</h2>"
348
+ end
349
+
350
+ html = html.gsub( CLEANUP_SUBSECTION_REGEX ) do |_|
351
+ puts " cleanup subsection (h3) heading >#{$1}<"
352
+
353
+ text = $1
354
+ text = text.strip # remove trailing space too
355
+
356
+ "<h3>#{text}</h3>"
357
+ end
358
+
303
359
  html
304
360
  end
305
361
 
@@ -4,7 +4,7 @@ module Factbook
4
4
 
5
5
  MAJOR = 1
6
6
  MINOR = 2
7
- PATCH = 1
7
+ PATCH = 2
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
@@ -15,6 +15,7 @@ class TestSanitizer < MiniTest::Test
15
15
  ## austria (au)
16
16
  ## algeria (ag)
17
17
  ## belgium (be)
18
+ ## ['au'].each do |cnty|
18
19
  ['au','ag','be'].each do |cnty|
19
20
 
20
21
  ## use/fix: ASCII-8BIT (e.g.keep as is) -???
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: factbook
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-06 00:00:00.000000000 Z
11
+ date: 2016-11-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: logutils