factbook 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/test/helper.rb ADDED
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+
3
+ ## $:.unshift(File.dirname(__FILE__))
4
+
5
+ ## minitest setup
6
+
7
+ # require 'minitest/unit'
8
+ require 'minitest/autorun'
9
+
10
+ # include MiniTest::Unit # lets us use TestCase instead of MiniTest::Unit::TestCase
11
+
12
+ ## our own code
13
+
14
+ require 'factbook'
15
+
data/test/test_json.rb ADDED
@@ -0,0 +1,45 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'helper'
5
+
6
+
7
+ class TestJson < MiniTest::Unit::TestCase
8
+
9
+ def setup
10
+ Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
11
+ end
12
+
13
+ def test_json
14
+ gen_json_for( 'au')
15
+ gen_json_for( 'be')
16
+ gen_json_for( 'br')
17
+ gen_json_for( 'mx')
18
+ end
19
+
20
+ def gen_json_for( code )
21
+ page = Factbook::Page.new( code )
22
+ page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_#{code}.html" )
23
+
24
+ ## print first 600 chars
25
+ pp page.html[0..600]
26
+
27
+ ## save for debuging
28
+
29
+ puts "saving a copy to #{code}.html for debugging"
30
+ File.open( "tmp/#{code}.html", 'w') do |f|
31
+ f.write( page.html )
32
+ end
33
+
34
+ h = page.data
35
+ pp h
36
+
37
+ ### save to json
38
+ puts "saving a copy to #{code}.json for debugging"
39
+ File.open( "tmp/#{code}.json", 'w') do |f|
40
+ f.write( JSON.pretty_generate( h ) )
41
+ end
42
+ end
43
+
44
+
45
+ end # class TestJson
data/test/test_page.rb ADDED
@@ -0,0 +1,227 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'helper'
5
+
6
+
7
+ class TestPage < MiniTest::Unit::TestCase
8
+
9
+ def setup
10
+ Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
11
+ end
12
+
13
+ def test_br
14
+ page = Factbook::Page.new( 'br' )
15
+
16
+ page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_br.html" )
17
+
18
+ ## print first 600 chars
19
+ pp page.html[0..600]
20
+
21
+ ## save for debuging
22
+
23
+ puts "saving a copy to br.html for debugging"
24
+ File.open( 'tmp/br.html', 'w') do |f|
25
+ f.write( page.html )
26
+ end
27
+
28
+ doc = page.doc
29
+ sects = page.sects
30
+
31
+ h = page.data
32
+ pp h
33
+
34
+ ### save to json
35
+ puts "saving a copy to br.json for debugging"
36
+ File.open( 'tmp/br.json', 'w') do |f|
37
+ f.write( JSON.pretty_generate( h ) )
38
+ end
39
+ end
40
+
41
+
42
+ def xxx_test_br
43
+ page = Factbook::Page.new( 'br' )
44
+
45
+ ## print first 600 chars
46
+ pp page.html[0..600]
47
+
48
+ ## save for debuging
49
+
50
+ Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
51
+ puts "saving a copy to br.html for debugging"
52
+ File.open( 'tmp/br.html', 'w') do |f|
53
+ f.write( page.html )
54
+ end
55
+
56
+ doc = page.doc
57
+ sects = page.sects
58
+
59
+ rows = doc.css( 'table tr' )
60
+ field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
61
+ data_ids = rows.css( '#data' )
62
+
63
+ puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
64
+
65
+ cats = rows.css( '.category' )
66
+ cats_div = rows.css( 'div.category' )
67
+ cats_span = rows.css( 'span.category' )
68
+ cats_other_size = cats.size - cats_div.size - cats_span.size
69
+
70
+ cats_data = rows.css( '.category_data' )
71
+ cats_div_data = rows.css( 'div.category_data' )
72
+ cats_span_data = rows.css( 'span.category_data' )
73
+ cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
74
+
75
+ puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
76
+ puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
77
+
78
+ ## some check for structure
79
+ if cats_other_size > 0
80
+ puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
81
+ end
82
+
83
+ if cats_other_data_size > 0
84
+ puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
85
+ end
86
+
87
+ ## stats( doc )
88
+
89
+ sects.each_with_index do |sect,i|
90
+ puts ''
91
+ puts "############################"
92
+ puts "#### stats sect #{i}:"
93
+ pp page.sect_to_hash( sect )
94
+ end
95
+ end
96
+
97
+
98
+ def stats( doc )
99
+ rows = doc.css( 'table tr' )
100
+ cells = doc.css( 'table tr td' )
101
+ field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
102
+ data_ids = rows.css( '#data' )
103
+
104
+ puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
105
+
106
+ hash = {}
107
+ last_cat = nil
108
+
109
+
110
+ cells.each_with_index do |cell,i|
111
+ ## next if i > 14 ## skip after xx for debugging for now
112
+
113
+ # check if field or data id
114
+
115
+ # check for (nested) div#field in td
116
+ has_field_id = cell.css( '#field' ).size == 1 ? true : false
117
+
118
+ # check for td#data
119
+ has_data_id = cell['id'] == 'data' ? true : false
120
+
121
+ if has_field_id
122
+
123
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
124
+ if cats.size == 1
125
+ text = cats.first.text.strip # remove/strip leading and trailing spaces
126
+ last_cat = text
127
+ puts " [#{i}] category: >>#{text}<<"
128
+ else
129
+ puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
130
+ puts cell.to_s
131
+ end
132
+
133
+ elsif has_data_id
134
+
135
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
136
+ cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
137
+ cats_div_data = cell.css( 'div.category_data' )
138
+ cats_span_data = cell.css( 'span.category_data' )
139
+
140
+ puts " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
141
+
142
+ pairs = []
143
+ last_pair = nil
144
+ last_pair_data_count = 0
145
+
146
+ ## loop over div blocks (might be .category or .category_data)
147
+ cell.children.each_with_index do |child,j|
148
+ unless child.element?
149
+ ## puts " **** !!!! skipping non-element type >#{child.type}<:"
150
+ ## puts child.to_s
151
+ next
152
+ end
153
+ unless child.name == 'div'
154
+ puts " **** !!! skipping non-div >#{child.name}<:"
155
+ puts child.to_s
156
+ next
157
+ end
158
+
159
+ ### check if .category or .category_data
160
+ if child['class'] == 'category'
161
+
162
+ ## collect text for category; exclude element w/ class.category_data
163
+ text = ""
164
+ child.children.each do |subchild|
165
+ text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
166
+ end
167
+
168
+ value = child.css('span.category_data').text.strip
169
+
170
+ puts " -- category >>#{text}<<"
171
+
172
+ ## start new pair
173
+ last_pair = [ text, value ]
174
+ last_pair_data_count = 0
175
+ pairs << last_pair
176
+
177
+ elsif child['class'] == 'category_data'
178
+ puts " -- category_data"
179
+
180
+ text = child.text.strip
181
+
182
+ if last_pair.nil?
183
+ ## assume its the very first entry; use implied/auto-created category
184
+ last_pair = [ 'text', '' ]
185
+ last_pair_data_count = 0
186
+ pairs << last_pair
187
+ end
188
+
189
+ ### first category_data element?
190
+ if last_pair_data_count == 0
191
+ if last_pair[1] == ''
192
+ last_pair[1] = text
193
+ else
194
+ last_pair[1] += " #{text}" ## append w/o separator
195
+ end
196
+ else
197
+ last_pair[1] += "; #{text}" ## append with separator
198
+ end
199
+ last_pair_data_count += 1
200
+
201
+ else
202
+ puts " **** !!! skipping div w/o category or category_data class:"
203
+ puts child.to_s
204
+ end
205
+ end
206
+
207
+ ## pp pairs
208
+
209
+ ## pairs to hash
210
+ pairs_hash = {}
211
+ pairs.each do |pair|
212
+ pairs_hash[ pair[0] ] = pair[1]
213
+ end
214
+
215
+ hash[ last_cat ] = pairs_hash
216
+
217
+ else
218
+ puts "#### !!!! unknown cell type (no field or data id found):"
219
+ puts cell.to_s
220
+ end
221
+ end # each cell
222
+
223
+ pp hash
224
+ end # method stats
225
+
226
+
227
+ end # class TestPage
@@ -0,0 +1,290 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'helper'
5
+
6
+
7
+ class TestPageOld < MiniTest::Unit::TestCase
8
+
9
+ def xxx_test_mx
10
+ page = Factbook::Page.new( 'mx' )
11
+
12
+ ## print first 600 chars
13
+ pp page.html[0..600]
14
+
15
+ doc = page.doc
16
+
17
+ panels = doc.css( '.CollapsiblePanel' )
18
+ questions = doc.css( '.question' )
19
+ answers = doc.css( '.answer' )
20
+
21
+ puts "panels.size: #{panels.size}"
22
+ puts "questions.size: #{questions.size}"
23
+ puts "answers.size: #{answers.size}"
24
+
25
+ cats0 = panels[0].css( '.category' )
26
+ cats0_data = panels[0].css( '.category_data' )
27
+
28
+ puts "cats0.size: #{cats0.size}"
29
+ puts "cats0_data.size: #{cats0_data.size}"
30
+
31
+ cats1 = panels[1].css( '.category' )
32
+ cats1_data = panels[1].css( '.category_data' )
33
+
34
+ puts "cats1.size: #{cats1.size}"
35
+ puts "cats1_data.size: #{cats1_data.size}"
36
+
37
+
38
+ ## fix: use cats -- add s
39
+ cat = doc.css( '#CollapsiblePanel1_Geo div.category' )
40
+ puts "cat.size: #{cat.size}"
41
+
42
+ catcheck = doc.css( '#CollapsiblePanel1_Geo .category' )
43
+ puts "catcheck.size: #{catcheck.size}"
44
+
45
+ catcheck2 = doc.css( '.category' )
46
+ puts "catcheck2.size: #{catcheck2.size}"
47
+
48
+
49
+ catdata = doc.css( '#CollapsiblePanel1_Geo .category_data' )
50
+ puts "catdata.size: #{catdata.size}"
51
+
52
+ catdatacheck2 = doc.css( '.category_data' )
53
+ puts "catdatacheck2.size: #{catdatacheck2.size}"
54
+
55
+ puts "catdata[0]:"
56
+ pp catdata[0]
57
+
58
+ puts "catdata[1]:"
59
+ pp catdata[1]
60
+
61
+ # puts "catdata[2]:"
62
+ # pp catdata[2]
63
+
64
+ # puts "catdata[0].text():"
65
+ # pp catdata[0].text()
66
+
67
+ # puts "cat[0].text():"
68
+ # pp cat[0].text()
69
+
70
+ # cat.each_with_index do |c,i|
71
+ # puts "[#{i+1}]: ========================="
72
+ # puts ">>#{c.text()}<<"
73
+ # end
74
+
75
+ end
76
+
77
+ def xxxx_test_mx
78
+ page = Factbook::Page.new( 'mx' )
79
+
80
+ ## print first 600 chars
81
+ pp page.html[0..600]
82
+
83
+ ## save for debuging
84
+
85
+ Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
86
+ puts "saving a copy to mx.html for debugging"
87
+ File.open( 'tmp/mx.html', 'w') do |f|
88
+ f.write( page.html )
89
+ end
90
+
91
+ doc = page.doc
92
+ sects = page.sects
93
+
94
+ panels = doc.css( '.CollapsiblePanel' )
95
+ questions = doc.css( '.question' )
96
+ answers = doc.css( '.answer' )
97
+
98
+ puts "panels.size: #{panels.size}"
99
+ puts "questions.size: #{questions.size}"
100
+ puts "answers.size: #{answers.size}"
101
+
102
+ rows_total = 0
103
+ panels.each_with_index do |panel,i|
104
+ rows = panel.css( 'table tr' )
105
+ puts " [#{i}] rows.size: #{rows.size}"
106
+ rows_total += rows.size
107
+ end
108
+
109
+ puts "rows_total: #{rows_total}"
110
+
111
+ rows = doc.css( 'table tr' )
112
+ field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
113
+ data_ids = rows.css( '#data' )
114
+
115
+ puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
116
+
117
+ cats = rows.css( '.category' )
118
+ cats_div = rows.css( 'div.category' )
119
+ cats_span = rows.css( 'span.category' )
120
+ cats_other_size = cats.size - cats_div.size - cats_span.size
121
+
122
+ cats_data = rows.css( '.category_data' )
123
+ cats_div_data = rows.css( 'div.category_data' )
124
+ cats_span_data = rows.css( 'span.category_data' )
125
+ cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
126
+
127
+ puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
128
+ puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
129
+
130
+ ## some check for structure
131
+ if cats_other_size > 0
132
+ puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
133
+ end
134
+
135
+ if cats_other_data_size > 0
136
+ puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
137
+ end
138
+
139
+ ## stats( doc )
140
+
141
+ sects.each_with_index do |sect,i|
142
+ puts ''
143
+ puts "############################"
144
+ puts "#### stats sect #{i}:"
145
+ stats( sect )
146
+ end
147
+ end
148
+
149
+
150
+ def stats( doc )
151
+ rows = doc.css( 'table tr' )
152
+ cells = doc.css( 'table tr td' )
153
+ field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
154
+ data_ids = rows.css( '#data' )
155
+
156
+ puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
157
+
158
+
159
+ ## check rows
160
+ ## todo/fix:
161
+ ## loop over td's !!!
162
+
163
+ cells.each_with_index do |cell,i|
164
+ ## next if i > 14 ## skip after xx for debugging for now
165
+
166
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
167
+ cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
168
+ cats_div_data = cell.css( 'div.category_data' )
169
+ cats_span_data = cell.css( 'span.category_data' )
170
+
171
+ field_ids = cell.css( '#field' ) ## td div.field check - use div#field.category -- possible?
172
+
173
+ ### fix: split into #field and #data
174
+ ## field has no category-data no sub/multiple categories etc.
175
+
176
+ ## td#data
177
+ # quick hack: use parent() - fix!! check id for element if present and is data how?? e.g. cell['id'] == 'data' ???
178
+ data_ids = cell.parent.css( '#data' ) ## will include self? e.g. td id='data' ???
179
+
180
+ ids_size = field_ids.size + data_ids.size
181
+
182
+ if ids_size == 0
183
+ puts " ****!!!! no ids (field/data) found"
184
+ end
185
+
186
+ if ids_size > 1
187
+ puts " ***!!! more than one id (field/data) found - #{ids_size}"
188
+ end
189
+
190
+
191
+ ## check for subcategory
192
+ ## must be div w/ id field and class category
193
+
194
+ if field_ids.size == 1 ## assume category
195
+
196
+ if cats.size == 1 && cats_data.size == 0 && cats.first.name == 'div'
197
+ text = cats.first.text.strip # remove/strip leading and trailing spaces
198
+ puts " [#{i}] category: >>#{text}<<"
199
+ else
200
+ puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
201
+ end
202
+
203
+ elsif data_ids.size == 1
204
+
205
+ if cats.size == 0
206
+ if cats_data.size == 1 ## check for cats_data.first.name == 'div' too ???
207
+ text = cats_data.first.text.strip # remove/strip leading and trailing spaces
208
+ puts " - [#{i}] data: >>#{text}<<"
209
+ elsif cats_data.size > 1 ## check for cats_data.first.name == 'div' too ???
210
+ ary = []
211
+ cats_data.each do |cat_data|
212
+ ary << cat_data.text.strip
213
+ end
214
+ text = ary.join( '; ' )
215
+ puts " - [#{i}] data#{cats_data.size}: >>#{text}<<"
216
+ else
217
+ # should not happen
218
+ puts "*** !!!! warn/err - skip empty data cell (no cats/no cats_data)"
219
+ end
220
+ elsif cats.size > 0
221
+ puts " [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size}/ cats_span_data: #{cats_span_data.size})"
222
+
223
+
224
+ ## check for "free standing" data blocks (not assigned to category/key)
225
+ if cats_div_data.size > 1
226
+ if cats_div_data.size == 1 #
227
+ # check if first or last entry (if first entry use key *text*; otherwise use key *notes*)
228
+ else ## multiple (more than one) data divs
229
+ if cats.size == 1
230
+ # always assume text for now (not *notes*)
231
+ else
232
+ # multiple cats and multiple data divs (e.g. drinking water source:)
233
+ # to be done - for now use one all-in-one text blob
234
+ end
235
+ end
236
+ end
237
+
238
+ cats.each_with_index do |cat,j| # note: use index - j (for inner loop)
239
+ ## get text from direct child / children
240
+ ## do NOT included text from nested span - how? possible?
241
+ ## text = cat.css( ':not( .category_data )' ).text.strip ## will it include text node(s)??
242
+ ## text = cat.text.strip ## will it include text node(s)??
243
+ ## text = cat.css( '*:not(.category_data)' ).text.strip
244
+ # Find the content of all child text nodes and join them together
245
+
246
+ ## collect text for category; exclude element w/ class.category_data
247
+ text = ""
248
+ cat.children.each do |child|
249
+ text << child.text.strip unless child.element? && child['class'] == 'category_data'
250
+ end
251
+
252
+ ## text = cat.xpath('text()').text.strip
253
+
254
+ n = cat.css( '.category_data' )
255
+ ## or use
256
+ ## text = cat.children.first.text ??
257
+ puts " -- [#{j}] subcategory: >>#{text}<< cats_data: #{n.size}"
258
+ ## pp cat.css( '*:not(.category_data)' )
259
+ ## pp cat.css( "*:not(*[@class='category_data'])" ) # *[@class='someclass']
260
+ ## pp cat
261
+ ## check if is div - if not issue warn
262
+ if cat.name == 'div'
263
+ ## check if includes one or more category_data nodes
264
+ if n.size == 0
265
+ puts " ****** !!! no category_data inside"
266
+ end
267
+ if n.size > 1
268
+ puts " ****** !!! multiple category_data's inside - #{n.size}"
269
+ end
270
+ else
271
+ puts " ****** !!!! no div - is >>#{cat.name}<<"
272
+ end
273
+ end
274
+ else
275
+ puts "**** !!!!!! warn/err - found element w/ data id (no cats, no cats-data) [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, data_ids: #{data_ids.size}"
276
+ end
277
+ else
278
+ puts "**** !!!!!!! [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, field_ids: #{field_ids.size}, data_ids: #{data_ids.size}"
279
+ end
280
+
281
+
282
+ if cats.size > 1
283
+ ## puts cell.to_s
284
+ end
285
+ end # each cell
286
+
287
+ end
288
+
289
+
290
+ end # class TestPageOld
@@ -0,0 +1,66 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'helper'
5
+
6
+
7
+ class TestStrip < MiniTest::Unit::TestCase
8
+ ## checks that the "country comparison to the world" regexes (extended /x mode) match a sample html snippet
9
+ def test_country_comparison
10
+ ## sample markup: a span.category label followed by a span.category_data wrapping a link
11
+ html=<<EOS
12
+
13
+ <span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data"> <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>
14
+
15
+ EOS
16
+
17
+ ## note: need to escapce space!!!! e.g. use to\s the\s world etc.
18
+ ## Note: To match whitespace in an x pattern use an escape such as \s or \p{Space}.
19
+
20
+ country_comparison_regex = /
21
+ <span \s class="category"[^>]*>
22
+ country \s comparison \s to \s the \s world:
23
+ <\/span>
24
+ \s*
25
+ <span \s class="category_data"[^>]*>
26
+ \s*
27
+ <a \s [^>]+>
28
+ .+?
29
+ <\/a>
30
+ \s*
31
+ <\/span>
32
+ /xm
33
+
34
+ country_comparison_space_regex = /
35
+ country \s comparison \s to \s the \s world:
36
+ /xm
37
+
38
+ country_comparison_span_regex = /
39
+ <span \s class="category"[^>]*>
40
+ /xm
41
+
42
+ country_comparison_cat_regex = /
43
+ <span \s class="category"[^>]*>
44
+ country \s comparison \s to \s the \s world:
45
+ <\/span>
46
+ /xm
47
+
48
+
49
+ m = country_comparison_space_regex.match( html )
50
+ pp m
51
+ assert m # must find a match
52
+
53
+ m = country_comparison_span_regex.match( html )
54
+ pp m
55
+ assert m # must find a match
56
+
57
+ m = country_comparison_cat_regex.match( html )
58
+ pp m
59
+ assert m # must find a match
60
+
61
+ m = country_comparison_regex.match( html )
62
+ pp m
63
+ assert m # must find a match
64
+ end
65
+
66
+ end # class TestStrip