factbook 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/test/helper.rb ADDED
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+
3
+ ## $:.unshift(File.dirname(__FILE__))
4
+
5
+ ## minitest setup
6
+
7
+ # require 'minitest/unit'
8
+ require 'minitest/autorun'
9
+
10
+ # include MiniTest::Unit # lets us use TestCase instead of MiniTest::Unit::TestCase
11
+
12
+ ## our own code
13
+
14
+ require 'factbook'
15
+
data/test/test_json.rb ADDED
@@ -0,0 +1,45 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'helper'
5
+
6
+
7
# Smoke test for the html-to-data conversion of Factbook::Page.
#
# For a handful of country codes the page is loaded from a local html
# fixture; the html and the extracted data (as pretty-printed json) are
# written to tmp/ for manual inspection. No assertions are made on the
# extracted data - the test only fails if something raises.
class TestJson < MiniTest::Unit::TestCase

  # Makes sure the tmp/ output folder exists before every test.
  def setup
    ## note: use Dir.exist? - the Dir.exists? alias is deprecated and was removed in ruby 3.2
    Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
  end

  # Runs the conversion for every country code that has a fixture
  # (same codes and same order as before: au, be, br, mx).
  def test_json
    %w[au be br mx].each do |code|
      gen_json_for( code )
    end
  end

  # Converts the fixture page for one country and dumps the results.
  #
  # code - country code used for the fixture file name
  #        (test/data/countrytemplate_<code>.html below Factbook.root)
  #        and for the output files tmp/<code>.html and tmp/<code>.json
  def gen_json_for( code )
    page = Factbook::Page.new( code )
    page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_#{code}.html" )

    ## print the start of the page (chars 0..600)
    pp page.html[0..600]

    ## save for debugging
    puts "saving a copy to #{code}.html for debugging"
    File.open( "tmp/#{code}.html", 'w') do |f|
      f.write( page.html )
    end

    h = page.data
    pp h

    ### save to json
    puts "saving a copy to #{code}.json for debugging"
    File.open( "tmp/#{code}.json", 'w') do |f|
      f.write( JSON.pretty_generate( h ) )
    end
  end

end # class TestJson
data/test/test_page.rb ADDED
@@ -0,0 +1,227 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'helper'
5
+
6
+
7
# Exploratory tests for Factbook::Page using the Brazil (br) country page.
#
# Nothing in here asserts on results; the methods print diagnostics and
# write copies of the page (html) and of the extracted data (json) to
# tmp/ so that they can be inspected by hand.
class TestPage < MiniTest::Unit::TestCase

  # Makes sure the tmp/ output folder exists before every test.
  def setup
    ## note: use Dir.exist? - the Dir.exists? alias is deprecated and was removed in ruby 3.2
    Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
  end

  # Loads the br fixture page, exercises the doc/sects/data accessors and
  # saves the html and the extracted data to tmp/br.html and tmp/br.json.
  def test_br
    page = Factbook::Page.new( 'br' )

    page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_br.html" )

    ## print the start of the page (chars 0..600)
    pp page.html[0..600]

    ## save for debugging
    puts "saving a copy to br.html for debugging"
    File.open( 'tmp/br.html', 'w') do |f|
      f.write( page.html )
    end

    ## call the accessors as a smoke check only (results are not inspected)
    page.doc
    page.sects

    h = page.data
    pp h

    ### save to json
    puts "saving a copy to br.json for debugging"
    File.open( 'tmp/br.json', 'w') do |f|
      f.write( JSON.pretty_generate( h ) )
    end
  end


  # Older exploratory variant; the xxx_ prefix keeps the name from matching
  # test_*, presumably so that the test runner skips it.
  #
  # Prints element counts (rows, #field / #data ids, .category and
  # .category_data split by div / span / other) and then the hash built by
  # page.sect_to_hash for every section.
  #
  # NOTE(review): page.html is read without being assigned first -
  # presumably Factbook::Page gets the page itself in that case; confirm.
  def xxx_test_br
    page = Factbook::Page.new( 'br' )

    ## print the start of the page (chars 0..600)
    pp page.html[0..600]

    ## save for debugging
    Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
    puts "saving a copy to br.html for debugging"
    File.open( 'tmp/br.html', 'w') do |f|
      f.write( page.html )
    end

    doc   = page.doc
    sects = page.sects

    rows = doc.css( 'table tr' )
    field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
    data_ids  = rows.css( '#data' )

    puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

    cats      = rows.css( '.category' )
    cats_div  = rows.css( 'div.category' )
    cats_span = rows.css( 'span.category' )
    ## whatever is left is neither a div nor a span
    cats_other_size = cats.size - cats_div.size - cats_span.size

    cats_data      = rows.css( '.category_data' )
    cats_div_data  = rows.css( 'div.category_data' )
    cats_span_data = rows.css( 'span.category_data' )
    cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size

    puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
    puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"

    ## some check for structure
    if cats_other_size > 0
      puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
    end

    if cats_other_data_size > 0
      puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
    end

    ## stats( doc )

    sects.each_with_index do |sect,i|
      puts ''
      puts "############################"
      puts "#### stats sect #{i}:"
      pp page.sect_to_hash( sect )
    end
  end


  # Walks all table cells of doc, prints diagnostics and pretty-prints a
  # nested hash of the form { category => { subcategory => text } }.
  #
  # doc - node (document or section) responding to css( selector )
  #
  # How cells are classified:
  #  - a cell with exactly one nested #field element starts a new category
  #    (the text of its single div.category)
  #  - a cell with id="data" holds the data for the last category seen;
  #    its direct div children are either .category (starts a new
  #    key/value pair) or .category_data (adds text to the current pair)
  #  - anything else gets reported as unknown
  #
  # Note: the only calls in this class are commented out (## stats( doc )).
  def stats( doc )
    rows  = doc.css( 'table tr' )
    cells = doc.css( 'table tr td' )
    field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
    data_ids  = rows.css( '#data' )

    puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

    hash = {}
    last_cat = nil   ## category of the most recent field cell; key for the data cells that follow

    cells.each_with_index do |cell,i|
      ## next if i > 14 ## skip after xx for debugging for now

      # check if field or data id

      # check for (nested) div#field in td
      has_field_id = cell.css( '#field' ).size == 1

      # check for td#data
      has_data_id = cell['id'] == 'data'

      if has_field_id

        cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
        if cats.size == 1
          text = cats.first.text.strip # remove/strip leading and trailing spaces
          last_cat = text
          puts " [#{i}] category: >>#{text}<<"
        else
          puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
          puts cell.to_s
        end

      elsif has_data_id

        cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
        cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
        cats_div_data  = cell.css( 'div.category_data' )
        cats_span_data = cell.css( 'span.category_data' )

        puts " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"

        pairs = []                 ## [key, value] pairs in document order
        last_pair = nil            ## pair that following .category_data divs get added to
        last_pair_data_count = 0   ## number of .category_data divs added to last_pair so far

        ## loop over div blocks (might be .category or .category_data)
        cell.children.each do |child|
          unless child.element?
            ## puts " **** !!!! skipping non-element type >#{child.type}<:"
            ## puts child.to_s
            next
          end
          unless child.name == 'div'
            puts " **** !!! skipping non-div >#{child.name}<:"
            puts child.to_s
            next
          end

          ### check if .category or .category_data
          if child['class'] == 'category'

            ## collect text for category; exclude element w/ class.category_data
            text = ""
            child.children.each do |subchild|
              text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
            end

            ## inline value (if any) sits in a nested span.category_data
            value = child.css('span.category_data').text.strip

            puts " -- category >>#{text}<<"

            ## start new pair
            last_pair = [ text, value ]
            last_pair_data_count = 0
            pairs << last_pair

          elsif child['class'] == 'category_data'
            puts " -- category_data"

            text = child.text.strip

            if last_pair.nil?
              ## assume it's the very first entry; use implied/auto-created category
              last_pair = [ 'text', '' ]
              last_pair_data_count = 0
              pairs << last_pair
            end

            ### first category_data element?
            if last_pair_data_count == 0
              if last_pair[1] == ''
                last_pair[1] = text
              else
                last_pair[1] += " #{text}" ## append w/o separator
              end
            else
              last_pair[1] += "; #{text}" ## append with separator
            end
            last_pair_data_count += 1

          else
            puts " **** !!! skipping div w/o category or category_data class:"
            puts child.to_s
          end
        end

        ## pp pairs

        ## pairs to hash (on duplicate keys the last pair wins)
        pairs_hash = {}
        pairs.each do |pair|
          pairs_hash[ pair[0] ] = pair[1]
        end

        hash[ last_cat ] = pairs_hash

      else
        puts "#### !!!! unknown cell type (no field or data id found):"
        puts cell.to_s
      end
    end # each cell

    pp hash
  end # method stats


end # class TestPage
@@ -0,0 +1,290 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'helper'
5
+
6
+
7
# Earlier exploratory checks of the page structure for the Mexico (mx)
# country page.
#
# Both "tests" carry an xxx_ / xxxx_ prefix, so neither name matches
# test_* - presumably to keep the test runner from running them. They only
# print element counts and diagnostics; nothing is asserted.
#
# NOTE(review): page.html is read without being assigned first -
# presumably Factbook::Page gets the page itself in that case; confirm.
class TestPageOld < MiniTest::Unit::TestCase

  # Prints how many panels, questions, answers, categories and
  # category_data elements are found with a couple of css selectors.
  def xxx_test_mx
    page = Factbook::Page.new( 'mx' )

    ## print the start of the page (chars 0..600)
    pp page.html[0..600]

    doc = page.doc

    panels    = doc.css( '.CollapsiblePanel' )
    questions = doc.css( '.question' )
    answers   = doc.css( '.answer' )

    puts "panels.size: #{panels.size}"
    puts "questions.size: #{questions.size}"
    puts "answers.size: #{answers.size}"

    cats0      = panels[0].css( '.category' )
    cats0_data = panels[0].css( '.category_data' )

    puts "cats0.size: #{cats0.size}"
    puts "cats0_data.size: #{cats0_data.size}"

    cats1      = panels[1].css( '.category' )
    cats1_data = panels[1].css( '.category_data' )

    puts "cats1.size: #{cats1.size}"
    puts "cats1_data.size: #{cats1_data.size}"


    ## fix: use cats -- add s
    cat = doc.css( '#CollapsiblePanel1_Geo div.category' )
    puts "cat.size: #{cat.size}"

    catcheck = doc.css( '#CollapsiblePanel1_Geo .category' )
    puts "catcheck.size: #{catcheck.size}"

    catcheck2 = doc.css( '.category' )
    puts "catcheck2.size: #{catcheck2.size}"


    catdata = doc.css( '#CollapsiblePanel1_Geo .category_data' )
    puts "catdata.size: #{catdata.size}"

    catdatacheck2 = doc.css( '.category_data' )
    puts "catdatacheck2.size: #{catdatacheck2.size}"

    puts "catdata[0]:"
    pp catdata[0]

    puts "catdata[1]:"
    pp catdata[1]

    # puts "catdata[2]:"
    # pp catdata[2]

    # puts "catdata[0].text():"
    # pp catdata[0].text()

    # puts "cat[0].text():"
    # pp cat[0].text()

    # cat.each_with_index do |c,i|
    #   puts "[#{i+1}]: ========================="
    #   puts ">>#{c.text()}<<"
    # end

  end

  # Saves a copy of the page to tmp/mx.html, prints element counts for the
  # whole document and per panel, and then runs stats on every section.
  def xxxx_test_mx
    page = Factbook::Page.new( 'mx' )

    ## print the start of the page (chars 0..600)
    pp page.html[0..600]

    ## save for debugging

    ## note: use Dir.exist? - the Dir.exists? alias is deprecated and was removed in ruby 3.2
    Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
    puts "saving a copy to mx.html for debugging"
    File.open( 'tmp/mx.html', 'w') do |f|
      f.write( page.html )
    end

    doc   = page.doc
    sects = page.sects

    panels    = doc.css( '.CollapsiblePanel' )
    questions = doc.css( '.question' )
    answers   = doc.css( '.answer' )

    puts "panels.size: #{panels.size}"
    puts "questions.size: #{questions.size}"
    puts "answers.size: #{answers.size}"

    rows_total = 0
    panels.each_with_index do |panel,i|
      rows = panel.css( 'table tr' )
      puts " [#{i}] rows.size: #{rows.size}"
      rows_total += rows.size
    end

    puts "rows_total: #{rows_total}"

    rows = doc.css( 'table tr' )
    field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
    data_ids  = rows.css( '#data' )

    puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

    cats      = rows.css( '.category' )
    cats_div  = rows.css( 'div.category' )
    cats_span = rows.css( 'span.category' )
    ## whatever is left is neither a div nor a span
    cats_other_size = cats.size - cats_div.size - cats_span.size

    cats_data      = rows.css( '.category_data' )
    cats_div_data  = rows.css( 'div.category_data' )
    cats_span_data = rows.css( 'span.category_data' )
    cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size

    puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
    puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"

    ## some check for structure
    if cats_other_size > 0
      puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
    end

    if cats_other_data_size > 0
      puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
    end

    ## stats( doc )

    sects.each_with_index do |sect,i|
      puts ''
      puts "############################"
      puts "#### stats sect #{i}:"
      stats( sect )
    end
  end


  # Prints a structural report for every table cell found in doc.
  #
  # doc - node (document or section) responding to css( selector )
  #
  # For each td the method counts div.category and div/span.category_data
  # elements as well as #field / #data ids and prints what it finds:
  # a category (field cell), one or more data texts, or the subcategories
  # of a data cell, with warnings for unexpected combinations.
  # Only prints; nothing is collected or returned on purpose.
  def stats( doc )
    rows  = doc.css( 'table tr' )
    cells = doc.css( 'table tr td' )
    field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
    data_ids  = rows.css( '#data' )

    puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"


    ## check rows
    ## todo/fix:
    ## loop over td's !!!

    cells.each_with_index do |cell,i|
      ## next if i > 14 ## skip after xx for debugging for now

      cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
      cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
      cats_div_data  = cell.css( 'div.category_data' )
      cats_span_data = cell.css( 'span.category_data' )

      ## note: per-cell ids get their own names (cell_*) so that the
      ##   method-level field_ids / data_ids above are not overwritten
      cell_field_ids = cell.css( '#field' ) ## td div.field check - use div#field.category -- possible?

      ### fix: split into #field and #data
      ## field has no category-data no sub/multiple categories etc.

      ## td#data
      # quick hack: use parent() - fix!! check id for element if present and is data how?? e.g. cell['id'] == 'data' ???
      cell_data_ids = cell.parent.css( '#data' ) ## will include self? e.g. td id='data' ???

      ids_size = cell_field_ids.size + cell_data_ids.size

      if ids_size == 0
        puts " ****!!!! no ids (field/data) found"
      end

      if ids_size > 1
        puts " ***!!! more than one id (field/data) found - #{ids_size}"
      end


      ## check for subcategory
      ## must be div w/ id field and class category

      if cell_field_ids.size == 1 ## assume category

        if cats.size == 1 && cats_data.size == 0 && cats.first.name == 'div'
          text = cats.first.text.strip # remove/strip leading and trailing spaces
          puts " [#{i}] category: >>#{text}<<"
        else
          puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
        end

      elsif cell_data_ids.size == 1

        if cats.size == 0
          if cats_data.size == 1 ## check for cats_data.first.name == 'div' too ???
            text = cats_data.first.text.strip # remove/strip leading and trailing spaces
            puts " - [#{i}] data: >>#{text}<<"
          elsif cats_data.size > 1 ## check for cats_data.first.name == 'div' too ???
            ary = []
            cats_data.each do |cat_data|
              ary << cat_data.text.strip
            end
            text = ary.join( '; ' )
            puts " - [#{i}] data#{cats_data.size}: >>#{text}<<"
          else
            # should not happen
            puts "*** !!!! warn/err - skip empty data cell (no cats/no cats_data)"
          end
        elsif cats.size > 0
          puts " [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size}/ cats_span_data: #{cats_span_data.size})"


          ## check for "free standing" data blocks (not assigned to category/key)
          ## note: placeholder only - all branches are still empty (to be done);
          ##   outer guard is > 0 (was > 1, which made the == 1 branch unreachable)
          if cats_div_data.size > 0
            if cats_div_data.size == 1 #
              # check if first or last entry (if first entry use key *text*; otherwise use key *notes*)
            else ## multiple (more than one) data divs
              if cats.size == 1
                # always assume text for now (not *notes*)
              else
                # multiple cats and multiple data divs (e.g. drinking water source:)
                # to be done - for now use one all-in-one text blob
              end
            end
          end

          cats.each_with_index do |cat,j| # note: use index - j (for inner loop)
            ## get text from direct child / children
            ## do NOT include text from nested span - how? possible?
            ## text = cat.css( ':not( .category_data )' ).text.strip ## will it include text node(s)??
            ## text = cat.text.strip ## will it include text node(s)??
            ## text = cat.css( '*:not(.category_data)' ).text.strip
            # Find the content of all child text nodes and join them together

            ## collect text for category; exclude element w/ class.category_data
            text = ""
            cat.children.each do |child|
              text << child.text.strip unless child.element? && child['class'] == 'category_data'
            end

            ## text = cat.xpath('text()').text.strip

            n = cat.css( '.category_data' )
            ## or use
            ## text = cat.children.first.text ??
            puts " -- [#{j}] subcategory: >>#{text}<< cats_data: #{n.size}"
            ## pp cat.css( '*:not(.category_data)' )
            ## pp cat.css( "*:not(*[@class='category_data'])" ) # *[@class='someclass']
            ## pp cat
            ## check if is div - if not issue warn
            if cat.name == 'div'
              ## check if includes one or more category_data nodes
              if n.size == 0
                puts " ****** !!! no category_data inside"
              end
              if n.size > 1
                puts " ****** !!! multiple category_data's inside - #{n.size}"
              end
            else
              puts " ****** !!!! no div - is >>#{cat.name}<<"
            end
          end
        else
          puts "**** !!!!!! warn/err - found element w/ data id (no cats, no cats-data) [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, data_ids: #{cell_data_ids.size}"
        end
      else
        puts "**** !!!!!!! [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, field_ids: #{cell_field_ids.size}, data_ids: #{cell_data_ids.size}"
      end


      ## debugging hook (disabled): dump cells with more than one category
      if cats.size > 1
        ## puts cell.to_s
      end
    end # each cell

  end


end # class TestPageOld
@@ -0,0 +1,66 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'helper'
5
+
6
+
7
# Checks the regular expressions for matching the
# "country comparison to the world" markup on a sample html snippet.
class TestStrip < MiniTest::Unit::TestCase

  # Matches four patterns against the snippet - from the plain label up to
  # the complete category span + category_data span with the nested link -
  # and expects every one of them to find a match.
  def test_country_comparison

    html=<<EOS

<span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data"> <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>

EOS

    ## note: need to escape spaces!!!! e.g. use to\s the\s world etc.
    ## Note: To match whitespace in an x pattern use an escape such as \s or \p{Space}.

    ## label text only
    label_regex = /
      country \s comparison \s to \s the \s world:
    /xm

    ## opening tag of the category span only
    span_open_regex = /
      <span \s class="category"[^>]*>
    /xm

    ## complete category span (opening tag, label, closing tag)
    category_regex = /
      <span \s class="category"[^>]*>
      country \s comparison \s to \s the \s world:
      <\/span>
    /xm

    ## category span followed by the category_data span w/ the rank link
    full_regex = /
      <span \s class="category"[^>]*>
      country \s comparison \s to \s the \s world:
      <\/span>
      \s*
      <span \s class="category_data"[^>]*>
      \s*
      <a \s [^>]+>
      .+?
      <\/a>
      \s*
      <\/span>
    /xm

    ## go from the simplest to the most complete pattern
    [ label_regex, span_open_regex, category_regex, full_regex ].each do |regex|
      found = regex.match( html )
      pp found
      assert found # must find a match
    end
  end

end # class TestStrip