factbook 1.2.1 → 1.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/factbook/builder.rb +23 -36
- data/lib/factbook/sanitizer.rb +57 -1
- data/lib/factbook/version.rb +1 -1
- data/test/test_sanitizer.rb +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ef14a3d11b87b854592f81c50406e43f4399ac3
|
4
|
+
data.tar.gz: 8c4eb43f4dfdca5b20d4dd60699b51c52a8cc7ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 57066d2a00d4818330ca710947fa5fedfb80ad224b84d31d906eb7606ba8127b8c1f102405a61f6dd3742b3c6f0fa4ec10f290a8d1da3367747a85ca31279a14
|
7
|
+
data.tar.gz: 6aa9fa498d6a1f836300e3fe985a5abd2cf8770211d414427608d2515f3d4fa56108b798e17a7e8d2b0f1bc52f8347787fd431cc97a099877eeafd89fffb12af
|
data/lib/factbook/builder.rb
CHANGED
@@ -36,7 +36,7 @@ attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/bin
|
|
36
36
|
|
37
37
|
def initialize( html_ascii )
|
38
38
|
@html_ascii = html_ascii
|
39
|
-
|
39
|
+
|
40
40
|
## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
|
41
41
|
@html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
|
42
42
|
|
@@ -53,7 +53,7 @@ def initialize( html_ascii )
|
|
53
53
|
html_subsects = html_sect[1]
|
54
54
|
puts html_sect_head
|
55
55
|
puts html_subsects.size
|
56
|
-
|
56
|
+
|
57
57
|
## get section title
|
58
58
|
## @SECTION{Economy} => Economy
|
59
59
|
if html_sect_head =~ /@SECTION{(.+?)}/
|
@@ -74,7 +74,7 @@ def initialize( html_ascii )
|
|
74
74
|
puts title
|
75
75
|
subsect = Subsect.new
|
76
76
|
subsect.title = title ## todo/fix: cut off trailing colon (:)
|
77
|
-
|
77
|
+
|
78
78
|
b = Factbook::ItemBuilder.new( html_subsect_body, title )
|
79
79
|
h = b.read
|
80
80
|
subsect.data = h
|
@@ -88,32 +88,23 @@ def initialize( html_ascii )
|
|
88
88
|
@sects << sect
|
89
89
|
else
|
90
90
|
## warn/fix: no section title found
|
91
|
-
end
|
91
|
+
end
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
self ## return self -- needed?? default (standard) anyway?? check and remove
|
95
95
|
end
|
96
96
|
|
97
97
|
|
98
98
|
|
99
99
|
def map_sects( html )
|
100
|
-
## convert section titles
|
101
|
-
##
|
102
|
-
##
|
103
|
-
|
104
|
-
## e.g.
|
105
|
-
## <h2 sectiontitle='Introduction' ccode='au'>Introduction :: <span class='region'>AUSTRIA </span></h2>
|
106
|
-
## <h2>Introduction</h2>
|
100
|
+
## convert section titles to "unified" marker
|
101
|
+
## e.g.
|
102
|
+
## <h2>Introduction</h2>
|
107
103
|
|
108
|
-
title_regex= /<h2
|
109
|
-
(?:\s[^>]+)? ## allow optional attributes in h2
|
110
|
-
>
|
104
|
+
title_regex= /<h2>
|
111
105
|
\s*
|
112
|
-
(
|
106
|
+
(.+?) ## note: use non-greedy; do NOT allow tags inside for now
|
113
107
|
\s*
|
114
|
-
(?:\s::\s
|
115
|
-
.+? ## note: use non-greedy; allows tags inside
|
116
|
-
)? ## strip optional name (e.g. :: AUSTRIA)
|
117
108
|
<\/h2>
|
118
109
|
/xim
|
119
110
|
|
@@ -121,33 +112,29 @@ def map_sects( html )
|
|
121
112
|
puts "** found section >#{$1}<:"
|
122
113
|
puts " >|#{m}|<"
|
123
114
|
|
124
|
-
"\n\n@SECTION{#{$1}}\n\n"
|
115
|
+
"\n\n@SECTION{#{$1}}\n\n"
|
125
116
|
end
|
126
117
|
html
|
127
118
|
end
|
128
119
|
|
129
120
|
|
130
121
|
def map_subsects( html )
|
131
|
-
## convert subsection titles
|
132
|
-
##
|
133
|
-
##
|
122
|
+
## convert subsection titles to "unified" marker
|
123
|
+
## e.g.
|
124
|
+
## <h3>Disputes - international:</h3>
|
134
125
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
title_regex= /<div \s id='field'
|
139
|
-
\s class='category'>
|
140
|
-
\s*
|
126
|
+
title_regex= /<h3>
|
127
|
+
\s*
|
141
128
|
(.+?) ## note: use non-greedy; allows tags inside - why? why not
|
142
|
-
|
143
|
-
<\/div>
|
129
|
+
\s*
|
130
|
+
<\/h3>
|
144
131
|
/xim
|
145
132
|
|
146
133
|
html = html.gsub( title_regex ) do |m|
|
147
134
|
puts "** found subsection >#{$1}<:"
|
148
135
|
puts " >|#{m}|<"
|
149
136
|
|
150
|
-
"\n@SUBSECTION{#{$1}}\n"
|
137
|
+
"\n@SUBSECTION{#{$1}}\n"
|
151
138
|
end
|
152
139
|
html
|
153
140
|
end
|
@@ -166,9 +153,9 @@ def split_sects( html )
|
|
166
153
|
## String#split will include all capture groups in the result array
|
167
154
|
|
168
155
|
section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
|
169
|
-
|
156
|
+
|
170
157
|
chunks = html.split( section_regex )
|
171
|
-
|
158
|
+
|
172
159
|
## check if first item is a section or (html) prolog
|
173
160
|
# if prolog (remove)
|
174
161
|
chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
|
@@ -195,9 +182,9 @@ def split_subsects( html )
|
|
195
182
|
|
196
183
|
## note: "wrap" regex in a capture group (just one)
|
197
184
|
## String#split will include all capture groups in the result array
|
198
|
-
|
185
|
+
|
199
186
|
subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
|
200
|
-
|
187
|
+
|
201
188
|
chunks = html.split( subsection_regex )
|
202
189
|
|
203
190
|
## check if first item is a section or (html) prolog
|
data/lib/factbook/sanitizer.rb
CHANGED
@@ -209,6 +209,35 @@ REL_AFFILIATION_CATEGORY_REGEX = /
|
|
209
209
|
/xim
|
210
210
|
|
211
211
|
|
212
|
+
##########################################
|
213
|
+
## transforms / simplify
|
214
|
+
##
|
215
|
+
## <h2 sectiontitle='Introduction' ccode='ag'>
|
216
|
+
## Introduction :: <span class='region'>ALGERIA </span>
|
217
|
+
## </h2>
|
218
|
+
## becomes =>
|
219
|
+
## <h2>Introduction</h2>
|
220
|
+
##
|
221
|
+
## todo/fix: use named capture in future e.g.
|
222
|
+
## (?<text>.+?) instead of (.+?)
|
223
|
+
## not working for now w/ gsub (just passed in match string NOT match data)
|
224
|
+
|
225
|
+
CLEANUP_SECTION_REGEX = /
|
226
|
+
<h2 [^>]*>
|
227
|
+
(.+?)
|
228
|
+
<\/h2>
|
229
|
+
/xim
|
230
|
+
|
231
|
+
##
|
232
|
+
## <div id='field' class='category'>Electricity - consumption:</div>
|
233
|
+
## becomes =>
|
234
|
+
## <h3>Electricity - consumption:</h3>
|
235
|
+
|
236
|
+
CLEANUP_SUBSECTION_REGEX = /
|
237
|
+
<div \s id='field' [^>]*>
|
238
|
+
(.+?)
|
239
|
+
<\/div>
|
240
|
+
/xim
|
212
241
|
|
213
242
|
|
214
243
|
|
@@ -285,7 +314,7 @@ def sanitize_profile( html )
|
|
285
314
|
|
286
315
|
klasses = $2.split(' ')
|
287
316
|
klasses = klasses.select do |klass|
|
288
|
-
if ['
|
317
|
+
if ['category', 'category_data'].include?( klass )
|
289
318
|
true
|
290
319
|
else
|
291
320
|
puts " remove class #{klass}"
|
@@ -300,6 +329,33 @@ def sanitize_profile( html )
|
|
300
329
|
end
|
301
330
|
end
|
302
331
|
|
332
|
+
|
333
|
+
##################################################################
|
334
|
+
## simplify/cleanup section and subsection headings
|
335
|
+
|
336
|
+
html = html.gsub( CLEANUP_SECTION_REGEX ) do |_|
|
337
|
+
puts " cleanup section (h2) heading >#{$1}<"
|
338
|
+
|
339
|
+
text = $1
|
340
|
+
pos = text.index( '::' )
|
341
|
+
if pos ## if includes => :: <span> Region </span> -- cut off
|
342
|
+
puts " remove :: region/country from heading"
|
343
|
+
text = text[0...pos]
|
344
|
+
end
|
345
|
+
text = text.strip # remove trailing space too
|
346
|
+
|
347
|
+
"<h2>#{text}</h2>"
|
348
|
+
end
|
349
|
+
|
350
|
+
html = html.gsub( CLEANUP_SUBSECTION_REGEX ) do |_|
|
351
|
+
puts " cleanup subsection (h3) heading >#{$1}<"
|
352
|
+
|
353
|
+
text = $1
|
354
|
+
text = text.strip # remove trailing space too
|
355
|
+
|
356
|
+
"<h3>#{text}</h3>"
|
357
|
+
end
|
358
|
+
|
303
359
|
html
|
304
360
|
end
|
305
361
|
|
data/lib/factbook/version.rb
CHANGED
data/test/test_sanitizer.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: factbook
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.1
|
4
|
+
version: 1.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-11-
|
11
|
+
date: 2016-11-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: logutils
|