factbook 0.1.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +7 -0
  2. data/Manifest.txt +34 -22
  3. data/README.md +8 -3
  4. data/Rakefile +2 -263
  5. data/data/codes.csv +262 -0
  6. data/data/comparisons.csv +75 -0
  7. data/lib/factbook/builder.rb +214 -0
  8. data/lib/factbook/builder_item.rb +93 -0
  9. data/lib/factbook/codes.rb +119 -0
  10. data/lib/factbook/comparisons.rb +50 -0
  11. data/lib/factbook/page.rb +103 -303
  12. data/lib/factbook/sanitizer.rb +214 -0
  13. data/lib/factbook/sect.rb +29 -196
  14. data/lib/factbook/subsect.rb +18 -0
  15. data/lib/factbook/table.rb +52 -0
  16. data/lib/factbook/utils.rb +85 -0
  17. data/lib/factbook/utils_info.rb +102 -0
  18. data/lib/factbook/version.rb +4 -3
  19. data/lib/factbook.rb +23 -1
  20. data/test/data/au.html +579 -0
  21. data/test/data/au.yml +8 -0
  22. data/test/data/be.html +596 -0
  23. data/test/data/be.yml +8 -0
  24. data/test/data/src/au.html +2006 -0
  25. data/test/data/src/be.html +2011 -0
  26. data/test/helper.rb +0 -4
  27. data/test/test_builder.rb +37 -0
  28. data/test/test_codes.rb +76 -0
  29. data/test/test_comparisons.rb +19 -0
  30. data/test/test_fields.rb +21 -18
  31. data/test/test_item_builder.rb +99 -0
  32. data/test/test_json.rb +17 -20
  33. data/test/test_page.rb +18 -10
  34. data/test/test_sanitizer.rb +35 -0
  35. metadata +68 -49
  36. data/.gemtest +0 -0
  37. data/test/data/countrytemplate_au.html +0 -4179
  38. data/test/data/countrytemplate_be.html +0 -4260
  39. data/test/data/countrytemplate_br.html +0 -4366
  40. data/test/data/countrytemplate_ee.html +0 -2999
  41. data/test/data/countrytemplate_ls.html +0 -2728
  42. data/test/data/countrytemplate_mx.html +0 -4397
  43. data/test/data/countrytemplate_vt.html +0 -1726
  44. data/test/data/countrytemplate_xx.html +0 -2898
  45. data/test/test_page_old.rb +0 -478
  46. data/test/test_strip.rb +0 -66
data/test/test_page_old.rb DELETED
@@ -1,478 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- require 'helper'
5
-
6
-
7
- class TestPageOld < MiniTest::Unit::TestCase
8
-
9
-
10
- def xxx_test_br
11
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
12
-
13
- page = Factbook::Page.new( 'br' )
14
-
15
- ## print first 600 chars
16
- pp page.html[0..600]
17
-
18
- ## save for debuging
19
-
20
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
21
- puts "saving a copy to br.html for debugging"
22
- File.open( 'tmp/br.html', 'w') do |f|
23
- f.write( page.html )
24
- end
25
-
26
- doc = page.doc
27
- sects = page.sects
28
-
29
- rows = doc.css( 'table tr' )
30
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
31
- data_ids = rows.css( '#data' )
32
-
33
- puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
34
-
35
- cats = rows.css( '.category' )
36
- cats_div = rows.css( 'div.category' )
37
- cats_span = rows.css( 'span.category' )
38
- cats_other_size = cats.size - cats_div.size - cats_span.size
39
-
40
- cats_data = rows.css( '.category_data' )
41
- cats_div_data = rows.css( 'div.category_data' )
42
- cats_span_data = rows.css( 'span.category_data' )
43
- cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
44
-
45
- puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
46
- puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
47
-
48
- ## some check for structure
49
- if cats_other_size > 0
50
- puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
51
- end
52
-
53
- if cats_other_data_size > 0
54
- puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
55
- end
56
-
57
- ## stats( doc )
58
-
59
- sects.each_with_index do |sect,i|
60
- puts ''
61
- puts "############################"
62
- puts "#### stats sect #{i}:"
63
- pp page.sect_to_hash( sect )
64
- end
65
- end
66
-
67
-
68
- def xxx_stats( doc )
69
- rows = doc.css( 'table tr' )
70
- cells = doc.css( 'table tr td' )
71
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
72
- data_ids = rows.css( '#data' )
73
-
74
- puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
75
-
76
- hash = {}
77
- last_cat = nil
78
-
79
-
80
- cells.each_with_index do |cell,i|
81
- ## next if i > 14 ## skip after xx for debugging for now
82
-
83
- # check if field or data id
84
-
85
- # check for (nested) div#field in td
86
- has_field_id = cell.css( '#field' ).size == 1 ? true : false
87
-
88
- # check for td#data
89
- has_data_id = cell['id'] == 'data' ? true : false
90
-
91
- if has_field_id
92
-
93
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
94
- if cats.size == 1
95
- text = cats.first.text.strip # remove/strip leading and trailing spaces
96
- last_cat = text
97
- puts " [#{i}] category: >>#{text}<<"
98
- else
99
- puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
100
- puts cell.to_s
101
- end
102
-
103
- elsif has_data_id
104
-
105
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
106
- cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
107
- cats_div_data = cell.css( 'div.category_data' )
108
- cats_span_data = cell.css( 'span.category_data' )
109
-
110
- puts " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
111
-
112
- pairs = []
113
- last_pair = nil
114
- last_pair_data_count = 0
115
-
116
- ## loop over div blocks (might be .category or .category_data)
117
- cell.children.each_with_index do |child,j|
118
- unless child.element?
119
- ## puts " **** !!!! skipping non-element type >#{child.type}<:"
120
- ## puts child.to_s
121
- next
122
- end
123
- unless child.name == 'div'
124
- puts " **** !!! skipping non-div >#{child.name}<:"
125
- puts child.to_s
126
- next
127
- end
128
-
129
- ### check if .category or .category_data
130
- if child['class'] == 'category'
131
-
132
- ## collect text for category; exclude element w/ class.category_data
133
- text = ""
134
- child.children.each do |subchild|
135
- text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
136
- end
137
-
138
- value = child.css('span.category_data').text.strip
139
-
140
- puts " -- category >>#{text}<<"
141
-
142
- ## start new pair
143
- last_pair = [ text, value ]
144
- last_pair_data_count = 0
145
- pairs << last_pair
146
-
147
- elsif child['class'] == 'category_data'
148
- puts " -- category_data"
149
-
150
- text = child.text.strip
151
-
152
- if last_pair.nil?
153
- ## assume its the very first entry; use implied/auto-created category
154
- last_pair = [ 'text', '' ]
155
- last_pair_data_count = 0
156
- pairs << last_pair
157
- end
158
-
159
- ### first category_data element?
160
- if last_pair_data_count == 0
161
- if last_pair[1] == ''
162
- last_pair[1] = text
163
- else
164
- last_pair[1] += " #{text}" ## append w/o separator
165
- end
166
- else
167
- last_pair[1] += "; #{text}" ## append with separator
168
- end
169
- last_pair_data_count += 1
170
-
171
- else
172
- puts " **** !!! skipping div w/o category or category_data class:"
173
- puts child.to_s
174
- end
175
- end
176
-
177
- ## pp pairs
178
-
179
- ## pairs to hash
180
- pairs_hash = {}
181
- pairs.each do |pair|
182
- pairs_hash[ pair[0] ] = pair[1]
183
- end
184
-
185
- hash[ last_cat ] = pairs_hash
186
-
187
- else
188
- puts "#### !!!! unknown cell type (no field or data id found):"
189
- puts cell.to_s
190
- end
191
- end # each cell
192
-
193
- pp hash
194
- end # method stats
195
-
196
-
197
- def yyy_test_mx
198
- page = Factbook::Page.new( 'mx' )
199
-
200
- ## print first 600 chars
201
- pp page.html[0..600]
202
-
203
- doc = page.doc
204
-
205
- panels = doc.css( '.CollapsiblePanel' )
206
- questions = doc.css( '.question' )
207
- answers = doc.css( '.answer' )
208
-
209
- puts "panels.size: #{panels.size}"
210
- puts "questions.size: #{questions.size}"
211
- puts "answers.size: #{answers.size}"
212
-
213
- cats0 = panels[0].css( '.category' )
214
- cats0_data = panels[0].css( '.category_data' )
215
-
216
- puts "cats0.size: #{cats0.size}"
217
- puts "cats0_data.size: #{cats0_data.size}"
218
-
219
- cats1 = panels[1].css( '.category' )
220
- cats1_data = panels[1].css( '.category_data' )
221
-
222
- puts "cats1.size: #{cats1.size}"
223
- puts "cats1_data.size: #{cats1_data.size}"
224
-
225
-
226
- ## fix: use cats -- add s
227
- cat = doc.css( '#CollapsiblePanel1_Geo div.category' )
228
- puts "cat.size: #{cat.size}"
229
-
230
- catcheck = doc.css( '#CollapsiblePanel1_Geo .category' )
231
- puts "catcheck.size: #{catcheck.size}"
232
-
233
- catcheck2 = doc.css( '.category' )
234
- puts "catcheck2.size: #{catcheck2.size}"
235
-
236
-
237
- catdata = doc.css( '#CollapsiblePanel1_Geo .category_data' )
238
- puts "catdata.size: #{catdata.size}"
239
-
240
- catdatacheck2 = doc.css( '.category_data' )
241
- puts "catdatacheck2.size: #{catdatacheck2.size}"
242
-
243
- puts "catdata[0]:"
244
- pp catdata[0]
245
-
246
- puts "catdata[1]:"
247
- pp catdata[1]
248
-
249
- # puts "catdata[2]:"
250
- # pp catdata[2]
251
-
252
- # puts "catdata[0].text():"
253
- # pp catdata[0].text()
254
-
255
- # puts "cat[0].text():"
256
- # pp cat[0].text()
257
-
258
- # cat.each_with_index do |c,i|
259
- # puts "[#{i+1}]: ========================="
260
- # puts ">>#{c.text()}<<"
261
- # end
262
-
263
- end
264
-
265
- def yyy_test_mx
266
- page = Factbook::Page.new( 'mx' )
267
-
268
- ## print first 600 chars
269
- pp page.html[0..600]
270
-
271
- ## save for debuging
272
-
273
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
274
- puts "saving a copy to mx.html for debugging"
275
- File.open( 'tmp/mx.html', 'w') do |f|
276
- f.write( page.html )
277
- end
278
-
279
- doc = page.doc
280
- sects = page.sects
281
-
282
- panels = doc.css( '.CollapsiblePanel' )
283
- questions = doc.css( '.question' )
284
- answers = doc.css( '.answer' )
285
-
286
- puts "panels.size: #{panels.size}"
287
- puts "questions.size: #{questions.size}"
288
- puts "answers.size: #{answers.size}"
289
-
290
- rows_total = 0
291
- panels.each_with_index do |panel,i|
292
- rows = panel.css( 'table tr' )
293
- puts " [#{i}] rows.size: #{rows.size}"
294
- rows_total += rows.size
295
- end
296
-
297
- puts "rows_total: #{rows_total}"
298
-
299
- rows = doc.css( 'table tr' )
300
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
301
- data_ids = rows.css( '#data' )
302
-
303
- puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
304
-
305
- cats = rows.css( '.category' )
306
- cats_div = rows.css( 'div.category' )
307
- cats_span = rows.css( 'span.category' )
308
- cats_other_size = cats.size - cats_div.size - cats_span.size
309
-
310
- cats_data = rows.css( '.category_data' )
311
- cats_div_data = rows.css( 'div.category_data' )
312
- cats_span_data = rows.css( 'span.category_data' )
313
- cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
314
-
315
- puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
316
- puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
317
-
318
- ## some check for structure
319
- if cats_other_size > 0
320
- puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
321
- end
322
-
323
- if cats_other_data_size > 0
324
- puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
325
- end
326
-
327
- ## stats( doc )
328
-
329
- sects.each_with_index do |sect,i|
330
- puts ''
331
- puts "############################"
332
- puts "#### stats sect #{i}:"
333
- stats( sect )
334
- end
335
- end
336
-
337
-
338
- def yyy_stats( doc )
339
- rows = doc.css( 'table tr' )
340
- cells = doc.css( 'table tr td' )
341
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
342
- data_ids = rows.css( '#data' )
343
-
344
- puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
345
-
346
-
347
- ## check rows
348
- ## todo/fix:
349
- ## loop over td's !!!
350
-
351
- cells.each_with_index do |cell,i|
352
- ## next if i > 14 ## skip after xx for debugging for now
353
-
354
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
355
- cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
356
- cats_div_data = cell.css( 'div.category_data' )
357
- cats_span_data = cell.css( 'span.category_data' )
358
-
359
- field_ids = cell.css( '#field' ) ## td div.field check - use div#field.category -- possible?
360
-
361
- ### fix: split into #field and #data
362
- ## field has no category-data no sub/multiple categories etc.
363
-
364
- ## td#data
365
- # quick hack: use parent() - fix!! check id for element if present and is data how?? e.g. cell['id'] == 'data' ???
366
- data_ids = cell.parent.css( '#data' ) ## will include self? e.g. td id='data' ???
367
-
368
- ids_size = field_ids.size + data_ids.size
369
-
370
- if ids_size == 0
371
- puts " ****!!!! no ids (field/data) found"
372
- end
373
-
374
- if ids_size > 1
375
- puts " ***!!! more than one id (field/data) found - #{ids_size}"
376
- end
377
-
378
-
379
- ## check for subcategory
380
- ## must be div w/ id field and class category
381
-
382
- if field_ids.size == 1 ## assume category
383
-
384
- if cats.size == 1 && cats_data.size == 0 && cats.first.name == 'div'
385
- text = cats.first.text.strip # remove/strip leading and trailing spaces
386
- puts " [#{i}] category: >>#{text}<<"
387
- else
388
- puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
389
- end
390
-
391
- elsif data_ids.size == 1
392
-
393
- if cats.size == 0
394
- if cats_data.size == 1 ## check for cats_data.first.name == 'div' too ???
395
- text = cats_data.first.text.strip # remove/strip leading and trailing spaces
396
- puts " - [#{i}] data: >>#{text}<<"
397
- elsif cats_data.size > 1 ## check for cats_data.first.name == 'div' too ???
398
- ary = []
399
- cats_data.each do |cat_data|
400
- ary << cat_data.text.strip
401
- end
402
- text = ary.join( '; ' )
403
- puts " - [#{i}] data#{cats_data.size}: >>#{text}<<"
404
- else
405
- # should not happen
406
- puts "*** !!!! warn/err - skip empty data cell (no cats/no cats_data)"
407
- end
408
- elsif cats.size > 0
409
- puts " [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size}/ cats_span_data: #{cats_span_data.size})"
410
-
411
-
412
- ## check for "free standing" data blocks (not assigned to category/key)
413
- if cats_div_data.size > 1
414
- if cats_div_data.size == 1 #
415
- # check if first or last entry (if first entry use key *text*; otherwise use key *notes*)
416
- else ## multiple (more than one) data divs
417
- if cats.size == 1
418
- # always assume text for now (not *notes*)
419
- else
420
- # multiple cats and multiple data divs (e.g. drinking water source:)
421
- # to be done - for now use one all-in-one text blob
422
- end
423
- end
424
- end
425
-
426
- cats.each_with_index do |cat,j| # note: use index - j (for inner loop)
427
- ## get text from direct child / children
428
- ## do NOT included text from nested span - how? possible?
429
- ## text = cat.css( ':not( .category_data )' ).text.strip ## will it include text node(s)??
430
- ## text = cat.text.strip ## will it include text node(s)??
431
- ## text = cat.css( '*:not(.category_data)' ).text.strip
432
- # Find the content of all child text nodes and join them together
433
-
434
- ## collect text for category; exclude element w/ class.category_data
435
- text = ""
436
- cat.children.each do |child|
437
- text << child.text.strip unless child.element? && child['class'] == 'category_data'
438
- end
439
-
440
- ## text = cat.xpath('text()').text.strip
441
-
442
- n = cat.css( '.category_data' )
443
- ## or use
444
- ## text = cat.children.first.text ??
445
- puts " -- [#{j}] subcategory: >>#{text}<< cats_data: #{n.size}"
446
- ## pp cat.css( '*:not(.category_data)' )
447
- ## pp cat.css( "*:not(*[@class='category_data'])" ) # *[@class='someclass']
448
- ## pp cat
449
- ## check if is div - if not issue warn
450
- if cat.name == 'div'
451
- ## check if includes one or more category_data nodes
452
- if n.size == 0
453
- puts " ****** !!! no category_data inside"
454
- end
455
- if n.size > 1
456
- puts " ****** !!! multiple category_data's inside - #{n.size}"
457
- end
458
- else
459
- puts " ****** !!!! no div - is >>#{cat.name}<<"
460
- end
461
- end
462
- else
463
- puts "**** !!!!!! warn/err - found element w/ data id (no cats, no cats-data) [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, data_ids: #{data_ids.size}"
464
- end
465
- else
466
- puts "**** !!!!!!! [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, field_ids: #{field_ids.size}, data_ids: #{data_ids.size}"
467
- end
468
-
469
-
470
- if cats.size > 1
471
- ## puts cell.to_s
472
- end
473
- end # each cell
474
-
475
- end
476
-
477
-
478
- end # class TestPageOld
data/test/test_strip.rb DELETED
@@ -1,66 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- require 'helper'
5
-
6
-
7
- class TestStrip < MiniTest::Unit::TestCase
8
-
9
- def test_country_comparison
10
-
11
- html=<<EOS
12
-
13
- <span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data"> <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>
14
-
15
- EOS
16
-
17
- ## note: need to escapce space!!!! e.g. use to\s the\s world etc.
18
- ## Note: To match whitespace in an x pattern use an escape such as \s or \p{Space}.
19
-
20
- country_comparison_regex = /
21
- <span \s class="category"[^>]*>
22
- country \s comparison \s to \s the \s world:
23
- <\/span>
24
- \s*
25
- <span \s class="category_data"[^>]*>
26
- \s*
27
- <a \s [^>]+>
28
- .+?
29
- <\/a>
30
- \s*
31
- <\/span>
32
- /xm
33
-
34
- country_comparison_space_regex = /
35
- country \s comparison \s to \s the \s world:
36
- /xm
37
-
38
- country_comparison_span_regex = /
39
- <span \s class="category"[^>]*>
40
- /xm
41
-
42
- country_comparison_cat_regex = /
43
- <span \s class="category"[^>]*>
44
- country \s comparison \s to \s the \s world:
45
- <\/span>
46
- /xm
47
-
48
-
49
- m = country_comparison_space_regex.match( html )
50
- pp m
51
- assert m # must find a match
52
-
53
- m = country_comparison_span_regex.match( html )
54
- pp m
55
- assert m # must find a match
56
-
57
- m = country_comparison_cat_regex.match( html )
58
- pp m
59
- assert m # must find a match
60
-
61
- m = country_comparison_regex.match( html )
62
- pp m
63
- assert m # must find a match
64
- end
65
-
66
- end # class TestStrip