factbook 0.1.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Manifest.txt +34 -22
- data/README.md +8 -3
- data/Rakefile +2 -263
- data/data/codes.csv +262 -0
- data/data/comparisons.csv +75 -0
- data/lib/factbook/builder.rb +214 -0
- data/lib/factbook/builder_item.rb +93 -0
- data/lib/factbook/codes.rb +119 -0
- data/lib/factbook/comparisons.rb +50 -0
- data/lib/factbook/page.rb +103 -303
- data/lib/factbook/sanitizer.rb +214 -0
- data/lib/factbook/sect.rb +29 -196
- data/lib/factbook/subsect.rb +18 -0
- data/lib/factbook/table.rb +52 -0
- data/lib/factbook/utils.rb +85 -0
- data/lib/factbook/utils_info.rb +102 -0
- data/lib/factbook/version.rb +4 -3
- data/lib/factbook.rb +23 -1
- data/test/data/au.html +579 -0
- data/test/data/au.yml +8 -0
- data/test/data/be.html +596 -0
- data/test/data/be.yml +8 -0
- data/test/data/src/au.html +2006 -0
- data/test/data/src/be.html +2011 -0
- data/test/helper.rb +0 -4
- data/test/test_builder.rb +37 -0
- data/test/test_codes.rb +76 -0
- data/test/test_comparisons.rb +19 -0
- data/test/test_fields.rb +21 -18
- data/test/test_item_builder.rb +99 -0
- data/test/test_json.rb +17 -20
- data/test/test_page.rb +18 -10
- data/test/test_sanitizer.rb +35 -0
- metadata +68 -49
- data/.gemtest +0 -0
- data/test/data/countrytemplate_au.html +0 -4179
- data/test/data/countrytemplate_be.html +0 -4260
- data/test/data/countrytemplate_br.html +0 -4366
- data/test/data/countrytemplate_ee.html +0 -2999
- data/test/data/countrytemplate_ls.html +0 -2728
- data/test/data/countrytemplate_mx.html +0 -4397
- data/test/data/countrytemplate_vt.html +0 -1726
- data/test/data/countrytemplate_xx.html +0 -2898
- data/test/test_page_old.rb +0 -478
- data/test/test_strip.rb +0 -66
data/test/test_page_old.rb
DELETED
@@ -1,478 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
require 'helper'
|
5
|
-
|
6
|
-
|
7
|
-
class TestPageOld < MiniTest::Unit::TestCase
|
8
|
-
|
9
|
-
|
10
|
-
def xxx_test_br
|
11
|
-
Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
|
12
|
-
|
13
|
-
page = Factbook::Page.new( 'br' )
|
14
|
-
|
15
|
-
## print first 600 chars
|
16
|
-
pp page.html[0..600]
|
17
|
-
|
18
|
-
## save for debuging
|
19
|
-
|
20
|
-
Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
|
21
|
-
puts "saving a copy to br.html for debugging"
|
22
|
-
File.open( 'tmp/br.html', 'w') do |f|
|
23
|
-
f.write( page.html )
|
24
|
-
end
|
25
|
-
|
26
|
-
doc = page.doc
|
27
|
-
sects = page.sects
|
28
|
-
|
29
|
-
rows = doc.css( 'table tr' )
|
30
|
-
field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
|
31
|
-
data_ids = rows.css( '#data' )
|
32
|
-
|
33
|
-
puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
|
34
|
-
|
35
|
-
cats = rows.css( '.category' )
|
36
|
-
cats_div = rows.css( 'div.category' )
|
37
|
-
cats_span = rows.css( 'span.category' )
|
38
|
-
cats_other_size = cats.size - cats_div.size - cats_span.size
|
39
|
-
|
40
|
-
cats_data = rows.css( '.category_data' )
|
41
|
-
cats_div_data = rows.css( 'div.category_data' )
|
42
|
-
cats_span_data = rows.css( 'span.category_data' )
|
43
|
-
cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
|
44
|
-
|
45
|
-
puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
|
46
|
-
puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
|
47
|
-
|
48
|
-
## some check for structure
|
49
|
-
if cats_other_size > 0
|
50
|
-
puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
|
51
|
-
end
|
52
|
-
|
53
|
-
if cats_other_data_size > 0
|
54
|
-
puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
|
55
|
-
end
|
56
|
-
|
57
|
-
## stats( doc )
|
58
|
-
|
59
|
-
sects.each_with_index do |sect,i|
|
60
|
-
puts ''
|
61
|
-
puts "############################"
|
62
|
-
puts "#### stats sect #{i}:"
|
63
|
-
pp page.sect_to_hash( sect )
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
|
68
|
-
def xxx_stats( doc )
|
69
|
-
rows = doc.css( 'table tr' )
|
70
|
-
cells = doc.css( 'table tr td' )
|
71
|
-
field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
|
72
|
-
data_ids = rows.css( '#data' )
|
73
|
-
|
74
|
-
puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
|
75
|
-
|
76
|
-
hash = {}
|
77
|
-
last_cat = nil
|
78
|
-
|
79
|
-
|
80
|
-
cells.each_with_index do |cell,i|
|
81
|
-
## next if i > 14 ## skip after xx for debugging for now
|
82
|
-
|
83
|
-
# check if field or data id
|
84
|
-
|
85
|
-
# check for (nested) div#field in td
|
86
|
-
has_field_id = cell.css( '#field' ).size == 1 ? true : false
|
87
|
-
|
88
|
-
# check for td#data
|
89
|
-
has_data_id = cell['id'] == 'data' ? true : false
|
90
|
-
|
91
|
-
if has_field_id
|
92
|
-
|
93
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
94
|
-
if cats.size == 1
|
95
|
-
text = cats.first.text.strip # remove/strip leading and trailing spaces
|
96
|
-
last_cat = text
|
97
|
-
puts " [#{i}] category: >>#{text}<<"
|
98
|
-
else
|
99
|
-
puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
|
100
|
-
puts cell.to_s
|
101
|
-
end
|
102
|
-
|
103
|
-
elsif has_data_id
|
104
|
-
|
105
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
106
|
-
cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
|
107
|
-
cats_div_data = cell.css( 'div.category_data' )
|
108
|
-
cats_span_data = cell.css( 'span.category_data' )
|
109
|
-
|
110
|
-
puts " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
|
111
|
-
|
112
|
-
pairs = []
|
113
|
-
last_pair = nil
|
114
|
-
last_pair_data_count = 0
|
115
|
-
|
116
|
-
## loop over div blocks (might be .category or .category_data)
|
117
|
-
cell.children.each_with_index do |child,j|
|
118
|
-
unless child.element?
|
119
|
-
## puts " **** !!!! skipping non-element type >#{child.type}<:"
|
120
|
-
## puts child.to_s
|
121
|
-
next
|
122
|
-
end
|
123
|
-
unless child.name == 'div'
|
124
|
-
puts " **** !!! skipping non-div >#{child.name}<:"
|
125
|
-
puts child.to_s
|
126
|
-
next
|
127
|
-
end
|
128
|
-
|
129
|
-
### check if .category or .category_data
|
130
|
-
if child['class'] == 'category'
|
131
|
-
|
132
|
-
## collect text for category; exclude element w/ class.category_data
|
133
|
-
text = ""
|
134
|
-
child.children.each do |subchild|
|
135
|
-
text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
|
136
|
-
end
|
137
|
-
|
138
|
-
value = child.css('span.category_data').text.strip
|
139
|
-
|
140
|
-
puts " -- category >>#{text}<<"
|
141
|
-
|
142
|
-
## start new pair
|
143
|
-
last_pair = [ text, value ]
|
144
|
-
last_pair_data_count = 0
|
145
|
-
pairs << last_pair
|
146
|
-
|
147
|
-
elsif child['class'] == 'category_data'
|
148
|
-
puts " -- category_data"
|
149
|
-
|
150
|
-
text = child.text.strip
|
151
|
-
|
152
|
-
if last_pair.nil?
|
153
|
-
## assume its the very first entry; use implied/auto-created category
|
154
|
-
last_pair = [ 'text', '' ]
|
155
|
-
last_pair_data_count = 0
|
156
|
-
pairs << last_pair
|
157
|
-
end
|
158
|
-
|
159
|
-
### first category_data element?
|
160
|
-
if last_pair_data_count == 0
|
161
|
-
if last_pair[1] == ''
|
162
|
-
last_pair[1] = text
|
163
|
-
else
|
164
|
-
last_pair[1] += " #{text}" ## append w/o separator
|
165
|
-
end
|
166
|
-
else
|
167
|
-
last_pair[1] += "; #{text}" ## append with separator
|
168
|
-
end
|
169
|
-
last_pair_data_count += 1
|
170
|
-
|
171
|
-
else
|
172
|
-
puts " **** !!! skipping div w/o category or category_data class:"
|
173
|
-
puts child.to_s
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|
177
|
-
## pp pairs
|
178
|
-
|
179
|
-
## pairs to hash
|
180
|
-
pairs_hash = {}
|
181
|
-
pairs.each do |pair|
|
182
|
-
pairs_hash[ pair[0] ] = pair[1]
|
183
|
-
end
|
184
|
-
|
185
|
-
hash[ last_cat ] = pairs_hash
|
186
|
-
|
187
|
-
else
|
188
|
-
puts "#### !!!! unknown cell type (no field or data id found):"
|
189
|
-
puts cell.to_s
|
190
|
-
end
|
191
|
-
end # each cell
|
192
|
-
|
193
|
-
pp hash
|
194
|
-
end # method stats
|
195
|
-
|
196
|
-
|
197
|
-
def yyy_test_mx
|
198
|
-
page = Factbook::Page.new( 'mx' )
|
199
|
-
|
200
|
-
## print first 600 chars
|
201
|
-
pp page.html[0..600]
|
202
|
-
|
203
|
-
doc = page.doc
|
204
|
-
|
205
|
-
panels = doc.css( '.CollapsiblePanel' )
|
206
|
-
questions = doc.css( '.question' )
|
207
|
-
answers = doc.css( '.answer' )
|
208
|
-
|
209
|
-
puts "panels.size: #{panels.size}"
|
210
|
-
puts "questions.size: #{questions.size}"
|
211
|
-
puts "answers.size: #{answers.size}"
|
212
|
-
|
213
|
-
cats0 = panels[0].css( '.category' )
|
214
|
-
cats0_data = panels[0].css( '.category_data' )
|
215
|
-
|
216
|
-
puts "cats0.size: #{cats0.size}"
|
217
|
-
puts "cats0_data.size: #{cats0_data.size}"
|
218
|
-
|
219
|
-
cats1 = panels[1].css( '.category' )
|
220
|
-
cats1_data = panels[1].css( '.category_data' )
|
221
|
-
|
222
|
-
puts "cats1.size: #{cats1.size}"
|
223
|
-
puts "cats1_data.size: #{cats1_data.size}"
|
224
|
-
|
225
|
-
|
226
|
-
## fix: use cats -- add s
|
227
|
-
cat = doc.css( '#CollapsiblePanel1_Geo div.category' )
|
228
|
-
puts "cat.size: #{cat.size}"
|
229
|
-
|
230
|
-
catcheck = doc.css( '#CollapsiblePanel1_Geo .category' )
|
231
|
-
puts "catcheck.size: #{catcheck.size}"
|
232
|
-
|
233
|
-
catcheck2 = doc.css( '.category' )
|
234
|
-
puts "catcheck2.size: #{catcheck2.size}"
|
235
|
-
|
236
|
-
|
237
|
-
catdata = doc.css( '#CollapsiblePanel1_Geo .category_data' )
|
238
|
-
puts "catdata.size: #{catdata.size}"
|
239
|
-
|
240
|
-
catdatacheck2 = doc.css( '.category_data' )
|
241
|
-
puts "catdatacheck2.size: #{catdatacheck2.size}"
|
242
|
-
|
243
|
-
puts "catdata[0]:"
|
244
|
-
pp catdata[0]
|
245
|
-
|
246
|
-
puts "catdata[1]:"
|
247
|
-
pp catdata[1]
|
248
|
-
|
249
|
-
# puts "catdata[2]:"
|
250
|
-
# pp catdata[2]
|
251
|
-
|
252
|
-
# puts "catdata[0].text():"
|
253
|
-
# pp catdata[0].text()
|
254
|
-
|
255
|
-
# puts "cat[0].text():"
|
256
|
-
# pp cat[0].text()
|
257
|
-
|
258
|
-
# cat.each_with_index do |c,i|
|
259
|
-
# puts "[#{i+1}]: ========================="
|
260
|
-
# puts ">>#{c.text()}<<"
|
261
|
-
# end
|
262
|
-
|
263
|
-
end
|
264
|
-
|
265
|
-
def yyy_test_mx
|
266
|
-
page = Factbook::Page.new( 'mx' )
|
267
|
-
|
268
|
-
## print first 600 chars
|
269
|
-
pp page.html[0..600]
|
270
|
-
|
271
|
-
## save for debuging
|
272
|
-
|
273
|
-
Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
|
274
|
-
puts "saving a copy to mx.html for debugging"
|
275
|
-
File.open( 'tmp/mx.html', 'w') do |f|
|
276
|
-
f.write( page.html )
|
277
|
-
end
|
278
|
-
|
279
|
-
doc = page.doc
|
280
|
-
sects = page.sects
|
281
|
-
|
282
|
-
panels = doc.css( '.CollapsiblePanel' )
|
283
|
-
questions = doc.css( '.question' )
|
284
|
-
answers = doc.css( '.answer' )
|
285
|
-
|
286
|
-
puts "panels.size: #{panels.size}"
|
287
|
-
puts "questions.size: #{questions.size}"
|
288
|
-
puts "answers.size: #{answers.size}"
|
289
|
-
|
290
|
-
rows_total = 0
|
291
|
-
panels.each_with_index do |panel,i|
|
292
|
-
rows = panel.css( 'table tr' )
|
293
|
-
puts " [#{i}] rows.size: #{rows.size}"
|
294
|
-
rows_total += rows.size
|
295
|
-
end
|
296
|
-
|
297
|
-
puts "rows_total: #{rows_total}"
|
298
|
-
|
299
|
-
rows = doc.css( 'table tr' )
|
300
|
-
field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
|
301
|
-
data_ids = rows.css( '#data' )
|
302
|
-
|
303
|
-
puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
|
304
|
-
|
305
|
-
cats = rows.css( '.category' )
|
306
|
-
cats_div = rows.css( 'div.category' )
|
307
|
-
cats_span = rows.css( 'span.category' )
|
308
|
-
cats_other_size = cats.size - cats_div.size - cats_span.size
|
309
|
-
|
310
|
-
cats_data = rows.css( '.category_data' )
|
311
|
-
cats_div_data = rows.css( 'div.category_data' )
|
312
|
-
cats_span_data = rows.css( 'span.category_data' )
|
313
|
-
cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
|
314
|
-
|
315
|
-
puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
|
316
|
-
puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
|
317
|
-
|
318
|
-
## some check for structure
|
319
|
-
if cats_other_size > 0
|
320
|
-
puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
|
321
|
-
end
|
322
|
-
|
323
|
-
if cats_other_data_size > 0
|
324
|
-
puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
|
325
|
-
end
|
326
|
-
|
327
|
-
## stats( doc )
|
328
|
-
|
329
|
-
sects.each_with_index do |sect,i|
|
330
|
-
puts ''
|
331
|
-
puts "############################"
|
332
|
-
puts "#### stats sect #{i}:"
|
333
|
-
stats( sect )
|
334
|
-
end
|
335
|
-
end
|
336
|
-
|
337
|
-
|
338
|
-
def yyy_stats( doc )
|
339
|
-
rows = doc.css( 'table tr' )
|
340
|
-
cells = doc.css( 'table tr td' )
|
341
|
-
field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
|
342
|
-
data_ids = rows.css( '#data' )
|
343
|
-
|
344
|
-
puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
|
345
|
-
|
346
|
-
|
347
|
-
## check rows
|
348
|
-
## todo/fix:
|
349
|
-
## loop over td's !!!
|
350
|
-
|
351
|
-
cells.each_with_index do |cell,i|
|
352
|
-
## next if i > 14 ## skip after xx for debugging for now
|
353
|
-
|
354
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
355
|
-
cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
|
356
|
-
cats_div_data = cell.css( 'div.category_data' )
|
357
|
-
cats_span_data = cell.css( 'span.category_data' )
|
358
|
-
|
359
|
-
field_ids = cell.css( '#field' ) ## td div.field check - use div#field.category -- possible?
|
360
|
-
|
361
|
-
### fix: split into #field and #data
|
362
|
-
## field has no category-data no sub/multiple categories etc.
|
363
|
-
|
364
|
-
## td#data
|
365
|
-
# quick hack: use parent() - fix!! check id for element if present and is data how?? e.g. cell['id'] == 'data' ???
|
366
|
-
data_ids = cell.parent.css( '#data' ) ## will include self? e.g. td id='data' ???
|
367
|
-
|
368
|
-
ids_size = field_ids.size + data_ids.size
|
369
|
-
|
370
|
-
if ids_size == 0
|
371
|
-
puts " ****!!!! no ids (field/data) found"
|
372
|
-
end
|
373
|
-
|
374
|
-
if ids_size > 1
|
375
|
-
puts " ***!!! more than one id (field/data) found - #{ids_size}"
|
376
|
-
end
|
377
|
-
|
378
|
-
|
379
|
-
## check for subcategory
|
380
|
-
## must be div w/ id field and class category
|
381
|
-
|
382
|
-
if field_ids.size == 1 ## assume category
|
383
|
-
|
384
|
-
if cats.size == 1 && cats_data.size == 0 && cats.first.name == 'div'
|
385
|
-
text = cats.first.text.strip # remove/strip leading and trailing spaces
|
386
|
-
puts " [#{i}] category: >>#{text}<<"
|
387
|
-
else
|
388
|
-
puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
|
389
|
-
end
|
390
|
-
|
391
|
-
elsif data_ids.size == 1
|
392
|
-
|
393
|
-
if cats.size == 0
|
394
|
-
if cats_data.size == 1 ## check for cats_data.first.name == 'div' too ???
|
395
|
-
text = cats_data.first.text.strip # remove/strip leading and trailing spaces
|
396
|
-
puts " - [#{i}] data: >>#{text}<<"
|
397
|
-
elsif cats_data.size > 1 ## check for cats_data.first.name == 'div' too ???
|
398
|
-
ary = []
|
399
|
-
cats_data.each do |cat_data|
|
400
|
-
ary << cat_data.text.strip
|
401
|
-
end
|
402
|
-
text = ary.join( '; ' )
|
403
|
-
puts " - [#{i}] data#{cats_data.size}: >>#{text}<<"
|
404
|
-
else
|
405
|
-
# should not happen
|
406
|
-
puts "*** !!!! warn/err - skip empty data cell (no cats/no cats_data)"
|
407
|
-
end
|
408
|
-
elsif cats.size > 0
|
409
|
-
puts " [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size}/ cats_span_data: #{cats_span_data.size})"
|
410
|
-
|
411
|
-
|
412
|
-
## check for "free standing" data blocks (not assigned to category/key)
|
413
|
-
if cats_div_data.size > 1
|
414
|
-
if cats_div_data.size == 1 #
|
415
|
-
# check if first or last entry (if first entry use key *text*; otherwise use key *notes*)
|
416
|
-
else ## multiple (more than one) data divs
|
417
|
-
if cats.size == 1
|
418
|
-
# always assume text for now (not *notes*)
|
419
|
-
else
|
420
|
-
# multiple cats and multiple data divs (e.g. drinking water source:)
|
421
|
-
# to be done - for now use one all-in-one text blob
|
422
|
-
end
|
423
|
-
end
|
424
|
-
end
|
425
|
-
|
426
|
-
cats.each_with_index do |cat,j| # note: use index - j (for inner loop)
|
427
|
-
## get text from direct child / children
|
428
|
-
## do NOT included text from nested span - how? possible?
|
429
|
-
## text = cat.css( ':not( .category_data )' ).text.strip ## will it include text node(s)??
|
430
|
-
## text = cat.text.strip ## will it include text node(s)??
|
431
|
-
## text = cat.css( '*:not(.category_data)' ).text.strip
|
432
|
-
# Find the content of all child text nodes and join them together
|
433
|
-
|
434
|
-
## collect text for category; exclude element w/ class.category_data
|
435
|
-
text = ""
|
436
|
-
cat.children.each do |child|
|
437
|
-
text << child.text.strip unless child.element? && child['class'] == 'category_data'
|
438
|
-
end
|
439
|
-
|
440
|
-
## text = cat.xpath('text()').text.strip
|
441
|
-
|
442
|
-
n = cat.css( '.category_data' )
|
443
|
-
## or use
|
444
|
-
## text = cat.children.first.text ??
|
445
|
-
puts " -- [#{j}] subcategory: >>#{text}<< cats_data: #{n.size}"
|
446
|
-
## pp cat.css( '*:not(.category_data)' )
|
447
|
-
## pp cat.css( "*:not(*[@class='category_data'])" ) # *[@class='someclass']
|
448
|
-
## pp cat
|
449
|
-
## check if is div - if not issue warn
|
450
|
-
if cat.name == 'div'
|
451
|
-
## check if includes one or more category_data nodes
|
452
|
-
if n.size == 0
|
453
|
-
puts " ****** !!! no category_data inside"
|
454
|
-
end
|
455
|
-
if n.size > 1
|
456
|
-
puts " ****** !!! multiple category_data's inside - #{n.size}"
|
457
|
-
end
|
458
|
-
else
|
459
|
-
puts " ****** !!!! no div - is >>#{cat.name}<<"
|
460
|
-
end
|
461
|
-
end
|
462
|
-
else
|
463
|
-
puts "**** !!!!!! warn/err - found element w/ data id (no cats, no cats-data) [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, data_ids: #{data_ids.size}"
|
464
|
-
end
|
465
|
-
else
|
466
|
-
puts "**** !!!!!!! [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, field_ids: #{field_ids.size}, data_ids: #{data_ids.size}"
|
467
|
-
end
|
468
|
-
|
469
|
-
|
470
|
-
if cats.size > 1
|
471
|
-
## puts cell.to_s
|
472
|
-
end
|
473
|
-
end # each cell
|
474
|
-
|
475
|
-
end
|
476
|
-
|
477
|
-
|
478
|
-
end # class TestPageOld
|
data/test/test_strip.rb
DELETED
@@ -1,66 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
require 'helper'
|
5
|
-
|
6
|
-
|
7
|
-
class TestStrip < MiniTest::Unit::TestCase
|
8
|
-
|
9
|
-
def test_country_comparison
|
10
|
-
|
11
|
-
html=<<EOS
|
12
|
-
|
13
|
-
<span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data"> <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>
|
14
|
-
|
15
|
-
EOS
|
16
|
-
|
17
|
-
## note: need to escapce space!!!! e.g. use to\s the\s world etc.
|
18
|
-
## Note: To match whitespace in an x pattern use an escape such as \s or \p{Space}.
|
19
|
-
|
20
|
-
country_comparison_regex = /
|
21
|
-
<span \s class="category"[^>]*>
|
22
|
-
country \s comparison \s to \s the \s world:
|
23
|
-
<\/span>
|
24
|
-
\s*
|
25
|
-
<span \s class="category_data"[^>]*>
|
26
|
-
\s*
|
27
|
-
<a \s [^>]+>
|
28
|
-
.+?
|
29
|
-
<\/a>
|
30
|
-
\s*
|
31
|
-
<\/span>
|
32
|
-
/xm
|
33
|
-
|
34
|
-
country_comparison_space_regex = /
|
35
|
-
country \s comparison \s to \s the \s world:
|
36
|
-
/xm
|
37
|
-
|
38
|
-
country_comparison_span_regex = /
|
39
|
-
<span \s class="category"[^>]*>
|
40
|
-
/xm
|
41
|
-
|
42
|
-
country_comparison_cat_regex = /
|
43
|
-
<span \s class="category"[^>]*>
|
44
|
-
country \s comparison \s to \s the \s world:
|
45
|
-
<\/span>
|
46
|
-
/xm
|
47
|
-
|
48
|
-
|
49
|
-
m = country_comparison_space_regex.match( html )
|
50
|
-
pp m
|
51
|
-
assert m # must find a match
|
52
|
-
|
53
|
-
m = country_comparison_span_regex.match( html )
|
54
|
-
pp m
|
55
|
-
assert m # must find a match
|
56
|
-
|
57
|
-
m = country_comparison_cat_regex.match( html )
|
58
|
-
pp m
|
59
|
-
assert m # must find a match
|
60
|
-
|
61
|
-
m = country_comparison_regex.match( html )
|
62
|
-
pp m
|
63
|
-
assert m # must find a match
|
64
|
-
end
|
65
|
-
|
66
|
-
end # class TestStrip
|