factbook 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/Manifest.txt +10 -0
- data/README.md +24 -2
- data/Rakefile +2 -1
- data/lib/factbook/page.rb +408 -0
- data/lib/factbook/version.rb +1 -1
- data/lib/factbook.rb +4 -0
- data/test/data/countrytemplate_au.html +4179 -0
- data/test/data/countrytemplate_be.html +4260 -0
- data/test/data/countrytemplate_br.html +4366 -0
- data/test/data/countrytemplate_mx.html +4397 -0
- data/test/helper.rb +15 -0
- data/test/test_json.rb +45 -0
- data/test/test_page.rb +227 -0
- data/test/test_page_old.rb +290 -0
- data/test/test_strip.rb +66 -0
- metadata +37 -11
data/test/helper.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
## $:.unshift(File.dirname(__FILE__))
|
4
|
+
|
5
|
+
## minitest setup
|
6
|
+
|
7
|
+
# require 'minitest/unit'
|
8
|
+
require 'minitest/autorun'
|
9
|
+
|
10
|
+
# include MiniTest::Unit # lets us use TestCase instead of MiniTest::Unit::TestCase
|
11
|
+
|
12
|
+
## our own code
|
13
|
+
|
14
|
+
require 'factbook'
|
15
|
+
|
data/test/test_json.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestJson < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
# Runs before each test: makes sure a +tmp+ directory exists relative to the
# current working directory, because the tests in this class write their
# debug copies (html/json) into it.
def setup
  # Dir.exists? was deprecated and removed in Ruby 3.2; Dir.exist? is the supported form.
  Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
end
|
12
|
+
|
13
|
+
# Generates the debug html/json output for every country fixture
# (au, be, br, mx) by delegating to gen_json_for, in that order.
def test_json
  %w[au be br mx].each do |country_code|
    gen_json_for( country_code )
  end
end
|
19
|
+
|
20
|
+
# Loads the fixture page for +code+, then dumps debugging artifacts.
#
# Reads <Factbook.root>/test/data/countrytemplate_<code>.html into a
# Factbook::Page, pretty-prints the leading slice of the markup and the
# page's data hash, and writes both to tmp/<code>.html and tmp/<code>.json.
# The tmp directory must already exist (setup creates it).
#
# code - country code string used in the fixture and output file names.
def gen_json_for( code )
  page = Factbook::Page.new( code )
  fixture_path = "#{Factbook.root}/test/data/countrytemplate_#{code}.html"
  page.html = File.read( fixture_path )

  ## show the start of the markup (range 0..600 is inclusive)
  pp page.html[0..600]

  ## keep a copy of the markup for debugging
  puts "saving a copy to #{code}.html for debugging"
  File.write( "tmp/#{code}.html", page.html )

  data = page.data
  pp data

  ## keep a copy of the extracted data as pretty-printed json
  puts "saving a copy to #{code}.json for debugging"
  File.write( "tmp/#{code}.json", JSON.pretty_generate( data ) )
end
|
43
|
+
|
44
|
+
|
45
|
+
end # class TestJson
|
data/test/test_page.rb
ADDED
@@ -0,0 +1,227 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestPage < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
# Runs before each test: makes sure a +tmp+ directory exists relative to the
# current working directory, because the tests in this class write their
# debug copies (html/json) into it.
def setup
  # Dir.exists? was deprecated and removed in Ruby 3.2; Dir.exist? is the supported form.
  Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
end
|
12
|
+
|
13
|
+
# Loads the Brazil fixture page and dumps debugging artifacts.
#
# Reads <Factbook.root>/test/data/countrytemplate_br.html into a
# Factbook::Page, pretty-prints the leading slice of the markup and the
# page's data hash, and writes them to tmp/br.html and tmp/br.json.
# No assertions are made; the test passes as long as nothing raises.
def test_br
  page = Factbook::Page.new( 'br' )
  page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_br.html" )

  ## show the start of the markup (range 0..600 is inclusive)
  pp page.html[0..600]

  ## keep a copy of the markup for debugging
  puts "saving a copy to br.html for debugging"
  File.write( 'tmp/br.html', page.html )

  ## results are not used here; the calls are kept so both accessors still run
  ## NOTE(review): presumably this exercises the parsing step - confirm against Factbook::Page
  page.doc
  page.sects

  data = page.data
  pp data

  ## keep a copy of the extracted data as pretty-printed json
  puts "saving a copy to br.json for debugging"
  File.write( 'tmp/br.json', JSON.pretty_generate( data ) )
end
|
40
|
+
|
41
|
+
|
42
|
+
# Disabled exploratory test (the xxx_ prefix keeps minitest from running it).
#
# Builds a Factbook::Page for 'br' without assigning html, saves the page
# markup to tmp/br.html, prints element-count statistics for the parsed
# document (rows, #field / #data ids, .category / .category_data split by
# div vs. span), and finally pretty-prints page.sect_to_hash for every section.
def xxx_test_br
  page = Factbook::Page.new( 'br' )

  ## show the start of the markup (range 0..600 is inclusive)
  pp page.html[0..600]

  ## save for debugging
  ## note: Dir.exists? was removed in Ruby 3.2 - Dir.exist? is the supported form
  Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
  puts "saving a copy to br.html for debugging"
  File.open( 'tmp/br.html', 'w') do |f|
    f.write( page.html )
  end

  doc   = page.doc
  sects = page.sects

  rows      = doc.css( 'table tr' )
  field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
  data_ids  = rows.css( '#data' )

  puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

  cats      = rows.css( '.category' )
  cats_div  = rows.css( 'div.category' )
  cats_span = rows.css( 'span.category' )
  ## whatever carries the class but is neither a div nor a span
  cats_other_size = cats.size - cats_div.size - cats_span.size

  cats_data      = rows.css( '.category_data' )
  cats_div_data  = rows.css( 'div.category_data' )
  cats_span_data = rows.css( 'span.category_data' )
  cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size

  puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
  puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"

  ## some check for structure
  if cats_other_size > 0
    puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
  end

  if cats_other_data_size > 0
    puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
  end

  ## stats( doc )

  sects.each_with_index do |sect,i|
    puts ''
    puts "############################"
    puts "#### stats sect #{i}:"
    pp page.sect_to_hash( sect )
  end
end
|
96
|
+
|
97
|
+
|
98
|
+
# Debug helper: walks every table cell under +doc+ and prints what it finds,
# building a nested hash of { category => { key => value } } along the way.
#
# A cell containing exactly one element matching '#field' is treated as a
# category heading; a cell whose own id is 'data' is treated as the data
# block for the most recent heading. Inside a data cell only direct div
# children are considered:
# - div.category      starts a new key/value pair (the key is the div's text
#                     minus any child element of class category_data, the
#                     value is the text of its span.category_data descendants)
# - div.category_data is appended to the current pair - joined with a space
#                     when it is the first data div after the pair was
#                     opened, with "; " afterwards; if no pair is open yet an
#                     implied pair keyed 'text' is created
#
# doc - node responding to #css (the callers pass Nokogiri nodes).
#
# Prints progress and warnings with puts and pretty-prints the final hash.
def stats( doc )
  rows      = doc.css( 'table tr' )
  cells     = doc.css( 'table tr td' )
  field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
  data_ids  = rows.css( '#data' )

  puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

  result      = {}
  current_cat = nil

  cells.each_with_index do |cell, idx|
    ## (nested) div#field inside the td marks a heading; td#data marks a data cell
    heading_cell = cell.css( '#field' ).size == 1
    data_cell    = cell['id'] == 'data'

    unless heading_cell || data_cell
      puts "#### !!!! unknown cell type (no field or data id found):"
      puts cell.to_s
      next
    end

    if heading_cell
      ## note: any .category that is not a div is ignored here
      headings = cell.css( 'div.category' )
      if headings.size == 1
        current_cat = headings.first.text.strip
        puts " [#{idx}] category: >>#{current_cat}<<"
      else
        puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
        puts cell.to_s
      end
      next
    end

    ## --- data cell ---
    cats           = cell.css( 'div.category' )
    cats_data      = cell.css( 'div.category_data,span.category_data' )  ## note: a.category_data etc. are ignored
    cats_div_data  = cell.css( 'div.category_data' )
    cats_span_data = cell.css( 'span.category_data' )

    puts " - [#{idx}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"

    pairs     = []
    open_pair = nil   ## the [key, value] array currently being filled (also stored in pairs)
    data_seen = 0     ## number of category_data divs appended since open_pair was started

    cell.children.each do |node|
      next unless node.element?   ## text nodes, comments etc. are skipped silently

      if node.name != 'div'
        puts " **** !!! skipping non-div >#{node.name}<:"
        puts node.to_s
        next
      end

      case node['class']
      when 'category'
        ## key text: everything directly inside the div except category_data elements
        label = node.children
                    .reject { |sub| sub.element? && sub['class'] == 'category_data' }
                    .map    { |sub| sub.text.strip }
                    .join
        value = node.css('span.category_data').text.strip

        puts " -- category >>#{label}<<"

        open_pair = [ label, value ]
        data_seen = 0
        pairs << open_pair

      when 'category_data'
        puts " -- category_data"

        chunk = node.text.strip

        if open_pair.nil?
          ## very first entry without a category - use an implied key
          open_pair = [ 'text', '' ]
          data_seen = 0
          pairs << open_pair
        end

        open_pair[1] =
          if data_seen > 0
            "#{open_pair[1]}; #{chunk}"     ## later data divs: append with separator
          elsif open_pair[1] == ''
            chunk
          else
            "#{open_pair[1]} #{chunk}"      ## first data div after an inline value: no separator
          end
        data_seen += 1

      else
        puts " **** !!! skipping div w/o category or category_data class:"
        puts node.to_s
      end
    end

    ## later pairs with the same key overwrite earlier ones
    result[ current_cat ] = pairs.each_with_object( {} ) { |(key, val), acc| acc[key] = val }
  end

  pp result
end
|
225
|
+
|
226
|
+
|
227
|
+
end # class TestPage
|
@@ -0,0 +1,290 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestPageOld < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
# Disabled exploratory test (the xxx_ prefix keeps minitest from running it).
#
# Builds a Factbook::Page for 'mx' and prints how many elements match a
# series of css selectors: collapsible panels, questions, answers, the
# .category / .category_data elements of the first two panels and of the
# #CollapsiblePanel1_Geo panel, plus the first two category_data nodes.
def xxx_test_mx
  page = Factbook::Page.new( 'mx' )

  ## show the start of the markup (range 0..600 is inclusive)
  pp page.html[0..600]

  doc = page.doc

  panels    = doc.css( '.CollapsiblePanel' )
  questions = doc.css( '.question' )
  answers   = doc.css( '.answer' )

  [ ['panels', panels], ['questions', questions], ['answers', answers] ].each do |label, nodes|
    puts "#{label}.size: #{nodes.size}"
  end

  ## category / category_data counts for the first two panels
  [0, 1].each do |n|
    panel_cats      = panels[n].css( '.category' )
    panel_cats_data = panels[n].css( '.category_data' )

    puts "cats#{n}.size: #{panel_cats.size}"
    puts "cats#{n}_data.size: #{panel_cats_data.size}"
  end

  ## fix: use cats -- add s
  [
    [ 'cat',       '#CollapsiblePanel1_Geo div.category' ],
    [ 'catcheck',  '#CollapsiblePanel1_Geo .category' ],
    [ 'catcheck2', '.category' ],
  ].each do |label, selector|
    puts "#{label}.size: #{doc.css( selector ).size}"
  end

  catdata = doc.css( '#CollapsiblePanel1_Geo .category_data' )
  puts "catdata.size: #{catdata.size}"

  catdatacheck2 = doc.css( '.category_data' )
  puts "catdatacheck2.size: #{catdatacheck2.size}"

  [0, 1].each do |n|
    puts "catdata[#{n}]:"
    pp catdata[n]
  end

  ## (further dumps of catdata[2], catdata[0].text() and of each cat's text
  ##  were already commented out in the original and stay disabled)
end
|
76
|
+
|
77
|
+
# Disabled exploratory test (the xxxx_ prefix keeps minitest from running it).
#
# Builds a Factbook::Page for 'mx', saves the page markup to tmp/mx.html,
# prints element-count statistics for the parsed document (panels,
# questions, answers, table rows per panel and in total, #field / #data ids,
# .category / .category_data split by div vs. span), and finally runs
# stats on every section of the page.
def xxxx_test_mx
  page = Factbook::Page.new( 'mx' )

  ## show the start of the markup (range 0..600 is inclusive)
  pp page.html[0..600]

  ## save for debugging
  ## note: Dir.exists? was removed in Ruby 3.2 - Dir.exist? is the supported form
  Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
  puts "saving a copy to mx.html for debugging"
  File.open( 'tmp/mx.html', 'w') do |f|
    f.write( page.html )
  end

  doc   = page.doc
  sects = page.sects

  panels    = doc.css( '.CollapsiblePanel' )
  questions = doc.css( '.question' )
  answers   = doc.css( '.answer' )

  puts "panels.size: #{panels.size}"
  puts "questions.size: #{questions.size}"
  puts "answers.size: #{answers.size}"

  ## rows per panel, summed up for comparison with the document-wide count below
  rows_total = 0
  panels.each_with_index do |panel,i|
    rows = panel.css( 'table tr' )
    puts " [#{i}] rows.size: #{rows.size}"
    rows_total += rows.size
  end

  puts "rows_total: #{rows_total}"

  rows      = doc.css( 'table tr' )
  field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
  data_ids  = rows.css( '#data' )

  puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

  cats      = rows.css( '.category' )
  cats_div  = rows.css( 'div.category' )
  cats_span = rows.css( 'span.category' )
  ## whatever carries the class but is neither a div nor a span
  cats_other_size = cats.size - cats_div.size - cats_span.size

  cats_data      = rows.css( '.category_data' )
  cats_div_data  = rows.css( 'div.category_data' )
  cats_span_data = rows.css( 'span.category_data' )
  cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size

  puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
  puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"

  ## some check for structure
  if cats_other_size > 0
    puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
  end

  if cats_other_data_size > 0
    puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
  end

  ## stats( doc )

  sects.each_with_index do |sect,i|
    puts ''
    puts "############################"
    puts "#### stats sect #{i}:"
    stats( sect )
  end
end
|
148
|
+
|
149
|
+
|
150
|
+
# Debug helper (older variant): walks every table cell under +doc+ and prints
# a classification of its contents. Nothing is collected; output only.
#
# Per cell it counts div.category, div/span.category_data, elements matching
# '#field' inside the cell and elements matching '#data' inside the cell's
# parent, then reports:
# - a heading        when exactly one '#field' match exists
# - a data cell      when exactly one '#data' match exists - either plain
#                    data text(s) or one line per sub-category
# - a warning        otherwise
#
# doc - node responding to #css (the caller passes page sections).
def stats( doc )
  rows      = doc.css( 'table tr' )
  cells     = doc.css( 'table tr td' )
  field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
  data_ids  = rows.css( '#data' )

  puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

  ## check rows
  ## todo/fix:
  ##  loop over td's !!!

  cells.each_with_index do |cell,i|
    cats           = cell.css( 'div.category' )    ## note: any .category that is not a div is ignored
    cats_data      = cell.css( 'div.category_data,span.category_data' )  ## note: a.category_data etc. are ignored
    cats_div_data  = cell.css( 'div.category_data' )
    cats_span_data = cell.css( 'span.category_data' )

    ## per-cell counts get their own names so the document-wide
    ## field_ids / data_ids above are no longer overwritten from inside the block
    cell_field_ids = cell.css( '#field' )   ## td div.field check - use div#field.category -- possible?

    ### fix: split into #field and #data
    ##   field has no category-data no sub/multiple categories etc.

    ## td#data
    # quick hack: use parent() - fix!! check id for element if present and is data how?? e.g. cell['id'] == 'data' ???
    cell_data_ids = cell.parent.css( '#data' )   ## will include self? e.g. td id='data' ???

    ids_size = cell_field_ids.size + cell_data_ids.size

    puts " ****!!!! no ids (field/data) found" if ids_size == 0
    puts " ***!!! more than one id (field/data) found - #{ids_size}" if ids_size > 1

    ## check for subcategory
    ##  must be div w/ id field and class category

    if cell_field_ids.size == 1    ## assume category

      if cats.size == 1 && cats_data.size == 0 && cats.first.name == 'div'
        text = cats.first.text.strip
        puts " [#{i}] category: >>#{text}<<"
      else
        puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
      end

    elsif cell_data_ids.size == 1

      if cats.size == 0
        if cats_data.size == 1      ## check for cats_data.first.name == 'div' too ???
          text = cats_data.first.text.strip
          puts " - [#{i}] data: >>#{text}<<"
        elsif cats_data.size > 1    ## check for cats_data.first.name == 'div' too ???
          text = cats_data.map { |cat_data| cat_data.text.strip }.join( '; ' )
          puts " - [#{i}] data#{cats_data.size}: >>#{text}<<"
        else
          # should not happen
          puts "*** !!!! warn/err - skip empty data cell (no cats/no cats_data)"
        end
      else
        ## cats.size > 0 - the former third branch could never run because a size is never negative
        puts " [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size}/ cats_span_data: #{cats_span_data.size})"

        ## TODO: "free standing" data blocks (not assigned to category/key) are not handled yet.
        ##  The original placeholder was an empty conditional whose "exactly one data div"
        ##  branch was nested under a "more than one" guard and so was unreachable. Planned rules:
        ##   - exactly one data div: check if first or last entry
        ##       (if first entry use key *text*; otherwise use key *notes*)
        ##   - several data divs and one category: always assume text for now (not *notes*)
        ##   - several data divs and several categories (e.g. drinking water source:)
        ##       to be done - for now use one all-in-one text blob

        cats.each_with_index do |cat,j|    # note: use index - j (for inner loop)
          ## collect text for category from the direct children;
          ##  exclude elements w/ class category_data
          text = cat.children
                    .reject { |child| child.element? && child['class'] == 'category_data' }
                    .map    { |child| child.text.strip }
                    .join

          n = cat.css( '.category_data' )
          puts " -- [#{j}] subcategory: >>#{text}<< cats_data: #{n.size}"

          ## check if is div - if not issue warn
          if cat.name == 'div'
            ## check if includes one or more category_data nodes
            puts " ****** !!! no category_data inside" if n.size == 0
            puts " ****** !!! multiple category_data's inside - #{n.size}" if n.size > 1
          else
            puts " ****** !!!! no div - is >>#{cat.name}<<"
          end
        end
      end

    else
      puts "**** !!!!!!! [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, field_ids: #{cell_field_ids.size}, data_ids: #{cell_data_ids.size}"
    end
  end  # each cell
end
|
288
|
+
|
289
|
+
|
290
|
+
end # class TestPageOld
|
data/test/test_strip.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestStrip < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
# Checks that the extended-mode regexes for the "country comparison to the
# world" markup match a sample snippet. Four patterns are tried, from the
# bare label up to the complete category span + category_data span with its
# nested link; each must produce a match.
def test_country_comparison

  ## sample markup; the query string reads "&regionCode" - a previous copy had
  ##  the "&reg" prefix turned into the (R) sign by html entity decoding
  html=<<EOS

<span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data"> <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>

EOS

  ## note: need to escape space!!!! e.g. use to\s the\s world etc.
  ## Note: To match whitespace in an x pattern use an escape such as \s or \p{Space}.

  country_comparison_regex = /
     <span \s class="category"[^>]*>
       country \s comparison \s to \s the \s world:
     <\/span>
     \s*
     <span \s class="category_data"[^>]*>
       \s*
       <a \s [^>]+>
        .+?
       <\/a>
       \s*
     <\/span>
   /xm

  country_comparison_space_regex = /
     country \s comparison \s to \s the \s world:
   /xm

  country_comparison_span_regex = /
     <span \s class="category"[^>]*>
   /xm

  country_comparison_cat_regex = /
     <span \s class="category"[^>]*>
       country \s comparison \s to \s the \s world:
     <\/span>
   /xm

  ## simplest pattern first, complete pattern last
  [
    country_comparison_space_regex,
    country_comparison_span_regex,
    country_comparison_cat_regex,
    country_comparison_regex,
  ].each do |regex|
    m = regex.match( html )
    pp m
    assert m    # must find a match
  end
end
|
65
|
+
|
66
|
+
end # class TestStrip
|