factbook 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/factbook/builder.rb +23 -36
- data/lib/factbook/sanitizer.rb +57 -1
- data/lib/factbook/version.rb +1 -1
- data/test/test_sanitizer.rb +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ef14a3d11b87b854592f81c50406e43f4399ac3
|
4
|
+
data.tar.gz: 8c4eb43f4dfdca5b20d4dd60699b51c52a8cc7ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 57066d2a00d4818330ca710947fa5fedfb80ad224b84d31d906eb7606ba8127b8c1f102405a61f6dd3742b3c6f0fa4ec10f290a8d1da3367747a85ca31279a14
|
7
|
+
data.tar.gz: 6aa9fa498d6a1f836300e3fe985a5abd2cf8770211d414427608d2515f3d4fa56108b798e17a7e8d2b0f1bc52f8347787fd431cc97a099877eeafd89fffb12af
|
data/lib/factbook/builder.rb
CHANGED
@@ -36,7 +36,7 @@ attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/bin
|
|
36
36
|
|
37
37
|
def initialize( html_ascii )
|
38
38
|
@html_ascii = html_ascii
|
39
|
-
|
39
|
+
|
40
40
|
## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
|
41
41
|
@html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
|
42
42
|
|
@@ -53,7 +53,7 @@ def initialize( html_ascii )
|
|
53
53
|
html_subsects = html_sect[1]
|
54
54
|
puts html_sect_head
|
55
55
|
puts html_subsects.size
|
56
|
-
|
56
|
+
|
57
57
|
## get section title
|
58
58
|
## @SECTION{Economy} => Economy
|
59
59
|
if html_sect_head =~ /@SECTION{(.+?)}/
|
@@ -74,7 +74,7 @@ def initialize( html_ascii )
|
|
74
74
|
puts title
|
75
75
|
subsect = Subsect.new
|
76
76
|
subsect.title = title ## todo/fix: cut off trailing colon (:)
|
77
|
-
|
77
|
+
|
78
78
|
b = Factbook::ItemBuilder.new( html_subsect_body, title )
|
79
79
|
h = b.read
|
80
80
|
subsect.data = h
|
@@ -88,32 +88,23 @@ def initialize( html_ascii )
|
|
88
88
|
@sects << sect
|
89
89
|
else
|
90
90
|
## warn/fix: no section title found
|
91
|
-
end
|
91
|
+
end
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
self ## return self -- needed?? default (standard) anyway?? check and remove
|
95
95
|
end
|
96
96
|
|
97
97
|
|
98
98
|
|
99
99
|
def map_sects( html )
|
100
|
-
## convert section titles
|
101
|
-
##
|
102
|
-
##
|
103
|
-
|
104
|
-
## e.g.
|
105
|
-
## <h2 sectiontitle='Introduction' ccode='au'>Introduction :: <span class='region'>AUSTRIA </span></h2>
|
106
|
-
## <h2>Introduction</h2>
|
100
|
+
## convert section titles to "unified" marker
|
101
|
+
## e.g.
|
102
|
+
## <h2>Introduction</h2>
|
107
103
|
|
108
|
-
title_regex= /<h2
|
109
|
-
(?:\s[^>]+)? ## allow optional attributes in h2
|
110
|
-
>
|
104
|
+
title_regex= /<h2>
|
111
105
|
\s*
|
112
|
-
(
|
106
|
+
(.+?) ## note: use non-greedy; do NOT allow tags inside for now
|
113
107
|
\s*
|
114
|
-
(?:\s::\s
|
115
|
-
.+? ## note: use non-greedy; allows tags inside
|
116
|
-
)? ## strip optional name (e.g. :: AUSTRIA)
|
117
108
|
<\/h2>
|
118
109
|
/xim
|
119
110
|
|
@@ -121,33 +112,29 @@ def map_sects( html )
|
|
121
112
|
puts "** found section >#{$1}<:"
|
122
113
|
puts " >|#{m}|<"
|
123
114
|
|
124
|
-
"\n\n@SECTION{#{$1}}\n\n"
|
115
|
+
"\n\n@SECTION{#{$1}}\n\n"
|
125
116
|
end
|
126
117
|
html
|
127
118
|
end
|
128
119
|
|
129
120
|
|
130
121
|
def map_subsects( html )
|
131
|
-
## convert subsection titles
|
132
|
-
##
|
133
|
-
##
|
122
|
+
## convert subsection titles to "unified" marker
|
123
|
+
## e.g.
|
124
|
+
## <h3>Disputes - international:</h3>
|
134
125
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
title_regex= /<div \s id='field'
|
139
|
-
\s class='category'>
|
140
|
-
\s*
|
126
|
+
title_regex= /<h3>
|
127
|
+
\s*
|
141
128
|
(.+?) ## note: use non-greedy; allows tags inside - why? why not
|
142
|
-
|
143
|
-
<\/
|
129
|
+
\s*
|
130
|
+
<\/h3>
|
144
131
|
/xim
|
145
132
|
|
146
133
|
html = html.gsub( title_regex ) do |m|
|
147
134
|
puts "** found subsection >#{$1}<:"
|
148
135
|
puts " >|#{m}|<"
|
149
136
|
|
150
|
-
"\n@SUBSECTION{#{$1}}\n"
|
137
|
+
"\n@SUBSECTION{#{$1}}\n"
|
151
138
|
end
|
152
139
|
html
|
153
140
|
end
|
@@ -166,9 +153,9 @@ def split_sects( html )
|
|
166
153
|
## String#split will include all capture groups in the result array
|
167
154
|
|
168
155
|
section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
|
169
|
-
|
156
|
+
|
170
157
|
chunks = html.split( section_regex )
|
171
|
-
|
158
|
+
|
172
159
|
## check if first item is a section or (html) prolog
|
173
160
|
# if prolog (remove)
|
174
161
|
chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
|
@@ -195,9 +182,9 @@ def split_subsects( html )
|
|
195
182
|
|
196
183
|
## note: "wrap" regex in a capture group (just one)
|
197
184
|
## String#split will include all capture groups in the result array
|
198
|
-
|
185
|
+
|
199
186
|
subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
|
200
|
-
|
187
|
+
|
201
188
|
chunks = html.split( subsection_regex )
|
202
189
|
|
203
190
|
## check if first item is a section or (html) prolog
|
data/lib/factbook/sanitizer.rb
CHANGED
@@ -209,6 +209,35 @@ REL_AFFILIATION_CATEGORY_REGEX = /
|
|
209
209
|
/xim
|
210
210
|
|
211
211
|
|
212
|
+
##########################################
|
213
|
+
## transforms / simplify
|
214
|
+
##
|
215
|
+
## <h2 sectiontitle='Introduction' ccode='ag'>
|
216
|
+
## Introduction :: <span class='region'>ALGERIA </span>
|
217
|
+
## </h2>
|
218
|
+
## becomes =>
|
219
|
+
## <h2>Introduction</h2>
|
220
|
+
##
|
221
|
+
## todo/fix: use named capture in future e.g.
|
222
|
+
## (?<text>.+?) instead of (.+?)
|
223
|
+
## not working for now w/ gsub (just passed in match string NOT match data)
|
224
|
+
|
225
|
+
CLEANUP_SECTION_REGEX = /
|
226
|
+
<h2 [^>]*>
|
227
|
+
(.+?)
|
228
|
+
<\/h2>
|
229
|
+
/xim
|
230
|
+
|
231
|
+
##
|
232
|
+
## <div id='field' class='category'>Electricity - consumption:</div>
|
233
|
+
## becomes =>
|
234
|
+
## <h3>Electricity - consumption:</h3>
|
235
|
+
|
236
|
+
CLEANUP_SUBSECTION_REGEX = /
|
237
|
+
<div \s id='field' [^>]*>
|
238
|
+
(.+?)
|
239
|
+
<\/div>
|
240
|
+
/xim
|
212
241
|
|
213
242
|
|
214
243
|
|
@@ -285,7 +314,7 @@ def sanitize_profile( html )
|
|
285
314
|
|
286
315
|
klasses = $2.split(' ')
|
287
316
|
klasses = klasses.select do |klass|
|
288
|
-
if ['
|
317
|
+
if ['category', 'category_data'].include?( klass )
|
289
318
|
true
|
290
319
|
else
|
291
320
|
puts " remove class #{klass}"
|
@@ -300,6 +329,33 @@ def sanitize_profile( html )
|
|
300
329
|
end
|
301
330
|
end
|
302
331
|
|
332
|
+
|
333
|
+
##################################################################
|
334
|
+
## simplify/cleanup section and subsection headings
|
335
|
+
|
336
|
+
html = html.gsub( CLEANUP_SECTION_REGEX ) do |_|
|
337
|
+
puts " cleanup section (h2) heading >#{$1}<"
|
338
|
+
|
339
|
+
text = $1
|
340
|
+
pos = text.index( '::' )
|
341
|
+
if pos ## if includes => :: <span> Region </span> -- cut off
|
342
|
+
puts " remove :: region/country from heading"
|
343
|
+
text = text[0...pos]
|
344
|
+
end
|
345
|
+
text = text.strip # remove trailing space too
|
346
|
+
|
347
|
+
"<h2>#{text}</h2>"
|
348
|
+
end
|
349
|
+
|
350
|
+
html = html.gsub( CLEANUP_SUBSECTION_REGEX ) do |_|
|
351
|
+
puts " cleanup subsection (h3) heading >#{$1}<"
|
352
|
+
|
353
|
+
text = $1
|
354
|
+
text = text.strip # remove trailing space too
|
355
|
+
|
356
|
+
"<h3>#{text}</h3>"
|
357
|
+
end
|
358
|
+
|
303
359
|
html
|
304
360
|
end
|
305
361
|
|
data/lib/factbook/version.rb
CHANGED
data/test/test_sanitizer.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: factbook
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.1
|
4
|
+
version: 1.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-11-
|
11
|
+
date: 2016-11-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: logutils
|