factbook 0.1.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/Manifest.txt +34 -22
  3. data/README.md +8 -3
  4. data/Rakefile +2 -263
  5. data/data/codes.csv +262 -0
  6. data/data/comparisons.csv +75 -0
  7. data/lib/factbook/builder.rb +214 -0
  8. data/lib/factbook/builder_item.rb +93 -0
  9. data/lib/factbook/codes.rb +119 -0
  10. data/lib/factbook/comparisons.rb +50 -0
  11. data/lib/factbook/page.rb +103 -303
  12. data/lib/factbook/sanitizer.rb +214 -0
  13. data/lib/factbook/sect.rb +29 -196
  14. data/lib/factbook/subsect.rb +18 -0
  15. data/lib/factbook/table.rb +52 -0
  16. data/lib/factbook/utils.rb +85 -0
  17. data/lib/factbook/utils_info.rb +102 -0
  18. data/lib/factbook/version.rb +4 -3
  19. data/lib/factbook.rb +23 -1
  20. data/test/data/au.html +579 -0
  21. data/test/data/au.yml +8 -0
  22. data/test/data/be.html +596 -0
  23. data/test/data/be.yml +8 -0
  24. data/test/data/src/au.html +2006 -0
  25. data/test/data/src/be.html +2011 -0
  26. data/test/helper.rb +0 -4
  27. data/test/test_builder.rb +37 -0
  28. data/test/test_codes.rb +76 -0
  29. data/test/test_comparisons.rb +19 -0
  30. data/test/test_fields.rb +21 -18
  31. data/test/test_item_builder.rb +99 -0
  32. data/test/test_json.rb +17 -20
  33. data/test/test_page.rb +18 -10
  34. data/test/test_sanitizer.rb +35 -0
  35. metadata +68 -49
  36. data/.gemtest +0 -0
  37. data/test/data/countrytemplate_au.html +0 -4179
  38. data/test/data/countrytemplate_be.html +0 -4260
  39. data/test/data/countrytemplate_br.html +0 -4366
  40. data/test/data/countrytemplate_ee.html +0 -2999
  41. data/test/data/countrytemplate_ls.html +0 -2728
  42. data/test/data/countrytemplate_mx.html +0 -4397
  43. data/test/data/countrytemplate_vt.html +0 -1726
  44. data/test/data/countrytemplate_xx.html +0 -2898
  45. data/test/test_page_old.rb +0 -478
  46. data/test/test_strip.rb +0 -66
@@ -1,478 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- require 'helper'
5
-
6
-
7
- class TestPageOld < MiniTest::Unit::TestCase
8
-
9
-
10
- def xxx_test_br
11
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
12
-
13
- page = Factbook::Page.new( 'br' )
14
-
15
- ## print first 600 chars
16
- pp page.html[0..600]
17
-
18
- ## save for debuging
19
-
20
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
21
- puts "saving a copy to br.html for debugging"
22
- File.open( 'tmp/br.html', 'w') do |f|
23
- f.write( page.html )
24
- end
25
-
26
- doc = page.doc
27
- sects = page.sects
28
-
29
- rows = doc.css( 'table tr' )
30
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
31
- data_ids = rows.css( '#data' )
32
-
33
- puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
34
-
35
- cats = rows.css( '.category' )
36
- cats_div = rows.css( 'div.category' )
37
- cats_span = rows.css( 'span.category' )
38
- cats_other_size = cats.size - cats_div.size - cats_span.size
39
-
40
- cats_data = rows.css( '.category_data' )
41
- cats_div_data = rows.css( 'div.category_data' )
42
- cats_span_data = rows.css( 'span.category_data' )
43
- cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
44
-
45
- puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
46
- puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
47
-
48
- ## some check for structure
49
- if cats_other_size > 0
50
- puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
51
- end
52
-
53
- if cats_other_data_size > 0
54
- puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
55
- end
56
-
57
- ## stats( doc )
58
-
59
- sects.each_with_index do |sect,i|
60
- puts ''
61
- puts "############################"
62
- puts "#### stats sect #{i}:"
63
- pp page.sect_to_hash( sect )
64
- end
65
- end
66
-
67
-
68
- def xxx_stats( doc )
69
- rows = doc.css( 'table tr' )
70
- cells = doc.css( 'table tr td' )
71
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
72
- data_ids = rows.css( '#data' )
73
-
74
- puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
75
-
76
- hash = {}
77
- last_cat = nil
78
-
79
-
80
- cells.each_with_index do |cell,i|
81
- ## next if i > 14 ## skip after xx for debugging for now
82
-
83
- # check if field or data id
84
-
85
- # check for (nested) div#field in td
86
- has_field_id = cell.css( '#field' ).size == 1 ? true : false
87
-
88
- # check for td#data
89
- has_data_id = cell['id'] == 'data' ? true : false
90
-
91
- if has_field_id
92
-
93
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
94
- if cats.size == 1
95
- text = cats.first.text.strip # remove/strip leading and trailing spaces
96
- last_cat = text
97
- puts " [#{i}] category: >>#{text}<<"
98
- else
99
- puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
100
- puts cell.to_s
101
- end
102
-
103
- elsif has_data_id
104
-
105
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
106
- cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
107
- cats_div_data = cell.css( 'div.category_data' )
108
- cats_span_data = cell.css( 'span.category_data' )
109
-
110
- puts " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
111
-
112
- pairs = []
113
- last_pair = nil
114
- last_pair_data_count = 0
115
-
116
- ## loop over div blocks (might be .category or .category_data)
117
- cell.children.each_with_index do |child,j|
118
- unless child.element?
119
- ## puts " **** !!!! skipping non-element type >#{child.type}<:"
120
- ## puts child.to_s
121
- next
122
- end
123
- unless child.name == 'div'
124
- puts " **** !!! skipping non-div >#{child.name}<:"
125
- puts child.to_s
126
- next
127
- end
128
-
129
- ### check if .category or .category_data
130
- if child['class'] == 'category'
131
-
132
- ## collect text for category; exclude element w/ class.category_data
133
- text = ""
134
- child.children.each do |subchild|
135
- text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
136
- end
137
-
138
- value = child.css('span.category_data').text.strip
139
-
140
- puts " -- category >>#{text}<<"
141
-
142
- ## start new pair
143
- last_pair = [ text, value ]
144
- last_pair_data_count = 0
145
- pairs << last_pair
146
-
147
- elsif child['class'] == 'category_data'
148
- puts " -- category_data"
149
-
150
- text = child.text.strip
151
-
152
- if last_pair.nil?
153
- ## assume its the very first entry; use implied/auto-created category
154
- last_pair = [ 'text', '' ]
155
- last_pair_data_count = 0
156
- pairs << last_pair
157
- end
158
-
159
- ### first category_data element?
160
- if last_pair_data_count == 0
161
- if last_pair[1] == ''
162
- last_pair[1] = text
163
- else
164
- last_pair[1] += " #{text}" ## append w/o separator
165
- end
166
- else
167
- last_pair[1] += "; #{text}" ## append with separator
168
- end
169
- last_pair_data_count += 1
170
-
171
- else
172
- puts " **** !!! skipping div w/o category or category_data class:"
173
- puts child.to_s
174
- end
175
- end
176
-
177
- ## pp pairs
178
-
179
- ## pairs to hash
180
- pairs_hash = {}
181
- pairs.each do |pair|
182
- pairs_hash[ pair[0] ] = pair[1]
183
- end
184
-
185
- hash[ last_cat ] = pairs_hash
186
-
187
- else
188
- puts "#### !!!! unknown cell type (no field or data id found):"
189
- puts cell.to_s
190
- end
191
- end # each cell
192
-
193
- pp hash
194
- end # method stats
195
-
196
-
197
- def yyy_test_mx
198
- page = Factbook::Page.new( 'mx' )
199
-
200
- ## print first 600 chars
201
- pp page.html[0..600]
202
-
203
- doc = page.doc
204
-
205
- panels = doc.css( '.CollapsiblePanel' )
206
- questions = doc.css( '.question' )
207
- answers = doc.css( '.answer' )
208
-
209
- puts "panels.size: #{panels.size}"
210
- puts "questions.size: #{questions.size}"
211
- puts "answers.size: #{answers.size}"
212
-
213
- cats0 = panels[0].css( '.category' )
214
- cats0_data = panels[0].css( '.category_data' )
215
-
216
- puts "cats0.size: #{cats0.size}"
217
- puts "cats0_data.size: #{cats0_data.size}"
218
-
219
- cats1 = panels[1].css( '.category' )
220
- cats1_data = panels[1].css( '.category_data' )
221
-
222
- puts "cats1.size: #{cats1.size}"
223
- puts "cats1_data.size: #{cats1_data.size}"
224
-
225
-
226
- ## fix: use cats -- add s
227
- cat = doc.css( '#CollapsiblePanel1_Geo div.category' )
228
- puts "cat.size: #{cat.size}"
229
-
230
- catcheck = doc.css( '#CollapsiblePanel1_Geo .category' )
231
- puts "catcheck.size: #{catcheck.size}"
232
-
233
- catcheck2 = doc.css( '.category' )
234
- puts "catcheck2.size: #{catcheck2.size}"
235
-
236
-
237
- catdata = doc.css( '#CollapsiblePanel1_Geo .category_data' )
238
- puts "catdata.size: #{catdata.size}"
239
-
240
- catdatacheck2 = doc.css( '.category_data' )
241
- puts "catdatacheck2.size: #{catdatacheck2.size}"
242
-
243
- puts "catdata[0]:"
244
- pp catdata[0]
245
-
246
- puts "catdata[1]:"
247
- pp catdata[1]
248
-
249
- # puts "catdata[2]:"
250
- # pp catdata[2]
251
-
252
- # puts "catdata[0].text():"
253
- # pp catdata[0].text()
254
-
255
- # puts "cat[0].text():"
256
- # pp cat[0].text()
257
-
258
- # cat.each_with_index do |c,i|
259
- # puts "[#{i+1}]: ========================="
260
- # puts ">>#{c.text()}<<"
261
- # end
262
-
263
- end
264
-
265
- def yyy_test_mx
266
- page = Factbook::Page.new( 'mx' )
267
-
268
- ## print first 600 chars
269
- pp page.html[0..600]
270
-
271
- ## save for debuging
272
-
273
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
274
- puts "saving a copy to mx.html for debugging"
275
- File.open( 'tmp/mx.html', 'w') do |f|
276
- f.write( page.html )
277
- end
278
-
279
- doc = page.doc
280
- sects = page.sects
281
-
282
- panels = doc.css( '.CollapsiblePanel' )
283
- questions = doc.css( '.question' )
284
- answers = doc.css( '.answer' )
285
-
286
- puts "panels.size: #{panels.size}"
287
- puts "questions.size: #{questions.size}"
288
- puts "answers.size: #{answers.size}"
289
-
290
- rows_total = 0
291
- panels.each_with_index do |panel,i|
292
- rows = panel.css( 'table tr' )
293
- puts " [#{i}] rows.size: #{rows.size}"
294
- rows_total += rows.size
295
- end
296
-
297
- puts "rows_total: #{rows_total}"
298
-
299
- rows = doc.css( 'table tr' )
300
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
301
- data_ids = rows.css( '#data' )
302
-
303
- puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
304
-
305
- cats = rows.css( '.category' )
306
- cats_div = rows.css( 'div.category' )
307
- cats_span = rows.css( 'span.category' )
308
- cats_other_size = cats.size - cats_div.size - cats_span.size
309
-
310
- cats_data = rows.css( '.category_data' )
311
- cats_div_data = rows.css( 'div.category_data' )
312
- cats_span_data = rows.css( 'span.category_data' )
313
- cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
314
-
315
- puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
316
- puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
317
-
318
- ## some check for structure
319
- if cats_other_size > 0
320
- puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
321
- end
322
-
323
- if cats_other_data_size > 0
324
- puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
325
- end
326
-
327
- ## stats( doc )
328
-
329
- sects.each_with_index do |sect,i|
330
- puts ''
331
- puts "############################"
332
- puts "#### stats sect #{i}:"
333
- stats( sect )
334
- end
335
- end
336
-
337
-
338
- def yyy_stats( doc )
339
- rows = doc.css( 'table tr' )
340
- cells = doc.css( 'table tr td' )
341
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
342
- data_ids = rows.css( '#data' )
343
-
344
- puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
345
-
346
-
347
- ## check rows
348
- ## todo/fix:
349
- ## loop over td's !!!
350
-
351
- cells.each_with_index do |cell,i|
352
- ## next if i > 14 ## skip after xx for debugging for now
353
-
354
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
355
- cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
356
- cats_div_data = cell.css( 'div.category_data' )
357
- cats_span_data = cell.css( 'span.category_data' )
358
-
359
- field_ids = cell.css( '#field' ) ## td div.field check - use div#field.category -- possible?
360
-
361
- ### fix: split into #field and #data
362
- ## field has no category-data no sub/multiple categories etc.
363
-
364
- ## td#data
365
- # quick hack: use parent() - fix!! check id for element if present and is data how?? e.g. cell['id'] == 'data' ???
366
- data_ids = cell.parent.css( '#data' ) ## will include self? e.g. td id='data' ???
367
-
368
- ids_size = field_ids.size + data_ids.size
369
-
370
- if ids_size == 0
371
- puts " ****!!!! no ids (field/data) found"
372
- end
373
-
374
- if ids_size > 1
375
- puts " ***!!! more than one id (field/data) found - #{ids_size}"
376
- end
377
-
378
-
379
- ## check for subcategory
380
- ## must be div w/ id field and class category
381
-
382
- if field_ids.size == 1 ## assume category
383
-
384
- if cats.size == 1 && cats_data.size == 0 && cats.first.name == 'div'
385
- text = cats.first.text.strip # remove/strip leading and trailing spaces
386
- puts " [#{i}] category: >>#{text}<<"
387
- else
388
- puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
389
- end
390
-
391
- elsif data_ids.size == 1
392
-
393
- if cats.size == 0
394
- if cats_data.size == 1 ## check for cats_data.first.name == 'div' too ???
395
- text = cats_data.first.text.strip # remove/strip leading and trailing spaces
396
- puts " - [#{i}] data: >>#{text}<<"
397
- elsif cats_data.size > 1 ## check for cats_data.first.name == 'div' too ???
398
- ary = []
399
- cats_data.each do |cat_data|
400
- ary << cat_data.text.strip
401
- end
402
- text = ary.join( '; ' )
403
- puts " - [#{i}] data#{cats_data.size}: >>#{text}<<"
404
- else
405
- # should not happen
406
- puts "*** !!!! warn/err - skip empty data cell (no cats/no cats_data)"
407
- end
408
- elsif cats.size > 0
409
- puts " [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size}/ cats_span_data: #{cats_span_data.size})"
410
-
411
-
412
- ## check for "free standing" data blocks (not assigned to category/key)
413
- if cats_div_data.size > 1
414
- if cats_div_data.size == 1 #
415
- # check if first or last entry (if first entry use key *text*; otherwise use key *notes*)
416
- else ## multiple (more than one) data divs
417
- if cats.size == 1
418
- # always assume text for now (not *notes*)
419
- else
420
- # multiple cats and multiple data divs (e.g. drinking water source:)
421
- # to be done - for now use one all-in-one text blob
422
- end
423
- end
424
- end
425
-
426
- cats.each_with_index do |cat,j| # note: use index - j (for inner loop)
427
- ## get text from direct child / children
428
- ## do NOT included text from nested span - how? possible?
429
- ## text = cat.css( ':not( .category_data )' ).text.strip ## will it include text node(s)??
430
- ## text = cat.text.strip ## will it include text node(s)??
431
- ## text = cat.css( '*:not(.category_data)' ).text.strip
432
- # Find the content of all child text nodes and join them together
433
-
434
- ## collect text for category; exclude element w/ class.category_data
435
- text = ""
436
- cat.children.each do |child|
437
- text << child.text.strip unless child.element? && child['class'] == 'category_data'
438
- end
439
-
440
- ## text = cat.xpath('text()').text.strip
441
-
442
- n = cat.css( '.category_data' )
443
- ## or use
444
- ## text = cat.children.first.text ??
445
- puts " -- [#{j}] subcategory: >>#{text}<< cats_data: #{n.size}"
446
- ## pp cat.css( '*:not(.category_data)' )
447
- ## pp cat.css( "*:not(*[@class='category_data'])" ) # *[@class='someclass']
448
- ## pp cat
449
- ## check if is div - if not issue warn
450
- if cat.name == 'div'
451
- ## check if includes one or more category_data nodes
452
- if n.size == 0
453
- puts " ****** !!! no category_data inside"
454
- end
455
- if n.size > 1
456
- puts " ****** !!! multiple category_data's inside - #{n.size}"
457
- end
458
- else
459
- puts " ****** !!!! no div - is >>#{cat.name}<<"
460
- end
461
- end
462
- else
463
- puts "**** !!!!!! warn/err - found element w/ data id (no cats, no cats-data) [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, data_ids: #{data_ids.size}"
464
- end
465
- else
466
- puts "**** !!!!!!! [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, field_ids: #{field_ids.size}, data_ids: #{data_ids.size}"
467
- end
468
-
469
-
470
- if cats.size > 1
471
- ## puts cell.to_s
472
- end
473
- end # each cell
474
-
475
- end
476
-
477
-
478
- end # class TestPageOld
data/test/test_strip.rb DELETED
@@ -1,66 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- require 'helper'
5
-
6
-
7
- class TestStrip < MiniTest::Unit::TestCase
8
-
9
- def test_country_comparison
10
-
11
- html=<<EOS
12
-
13
- <span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data"> <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>
14
-
15
- EOS
16
-
17
- ## note: need to escapce space!!!! e.g. use to\s the\s world etc.
18
- ## Note: To match whitespace in an x pattern use an escape such as \s or \p{Space}.
19
-
20
- country_comparison_regex = /
21
- <span \s class="category"[^>]*>
22
- country \s comparison \s to \s the \s world:
23
- <\/span>
24
- \s*
25
- <span \s class="category_data"[^>]*>
26
- \s*
27
- <a \s [^>]+>
28
- .+?
29
- <\/a>
30
- \s*
31
- <\/span>
32
- /xm
33
-
34
- country_comparison_space_regex = /
35
- country \s comparison \s to \s the \s world:
36
- /xm
37
-
38
- country_comparison_span_regex = /
39
- <span \s class="category"[^>]*>
40
- /xm
41
-
42
- country_comparison_cat_regex = /
43
- <span \s class="category"[^>]*>
44
- country \s comparison \s to \s the \s world:
45
- <\/span>
46
- /xm
47
-
48
-
49
- m = country_comparison_space_regex.match( html )
50
- pp m
51
- assert m # must find a match
52
-
53
- m = country_comparison_span_regex.match( html )
54
- pp m
55
- assert m # must find a match
56
-
57
- m = country_comparison_cat_regex.match( html )
58
- pp m
59
- assert m # must find a match
60
-
61
- m = country_comparison_regex.match( html )
62
- pp m
63
- assert m # must find a match
64
- end
65
-
66
- end # class TestStrip