factbook 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/Manifest.txt +10 -0
- data/README.md +24 -2
- data/Rakefile +2 -1
- data/lib/factbook/page.rb +408 -0
- data/lib/factbook/version.rb +1 -1
- data/lib/factbook.rb +4 -0
- data/test/data/countrytemplate_au.html +4179 -0
- data/test/data/countrytemplate_be.html +4260 -0
- data/test/data/countrytemplate_br.html +4366 -0
- data/test/data/countrytemplate_mx.html +4397 -0
- data/test/helper.rb +15 -0
- data/test/test_json.rb +45 -0
- data/test/test_page.rb +227 -0
- data/test/test_page_old.rb +290 -0
- data/test/test_strip.rb +66 -0
- metadata +37 -11
data/test/helper.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
## $:.unshift(File.dirname(__FILE__))
|
4
|
+
|
5
|
+
## minitest setup
|
6
|
+
|
7
|
+
# require 'minitest/unit'
|
8
|
+
require 'minitest/autorun'
|
9
|
+
|
10
|
+
# include MiniTest::Unit # lets us use TestCase instead of MiniTest::Unit::TestCase
|
11
|
+
|
12
|
+
## our own code
|
13
|
+
|
14
|
+
require 'factbook'
|
15
|
+
|
data/test/test_json.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestJson < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
## Runs before each test: makes sure the scratch directory tmp/ exists
## (gen_json_for writes its debug copies there).
def setup
  ## note: Dir.exists? is deprecated and was removed in Ruby 3.2 - use Dir.exist?
  Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
end
|
12
|
+
|
13
|
+
## Generates the debug html/json copies for every country page
## shipped in test/data (au, be, br, mx) - in that order.
def test_json
  %w[au be br mx].each do |code|
    gen_json_for( code )
  end
end
|
19
|
+
|
20
|
+
## Reads the saved page test/data/countrytemplate_<code>.html, prints a preview
## and the data hash returned by the page, and stores debug copies in tmp/
## (tmp/<code>.html and tmp/<code>.json).
##
##  code - country code used in the test data file name (e.g. 'br')
def gen_json_for( code )
  page = Factbook::Page.new( code )

  source_path = "#{Factbook.root}/test/data/countrytemplate_#{code}.html"
  page.html   = File.read( source_path )

  ## preview - range 0..600 is inclusive
  pp page.html[0..600]

  ## keep a copy of the html for debugging
  puts "saving a copy to #{code}.html for debugging"
  File.open( "tmp/#{code}.html", 'w') { |out| out.write( page.html ) }

  data = page.data
  pp data

  ## keep a copy of the data as (pretty printed) json for debugging
  puts "saving a copy to #{code}.json for debugging"
  File.open( "tmp/#{code}.json", 'w') { |out| out.write( JSON.pretty_generate( data ) ) }
end
|
43
|
+
|
44
|
+
|
45
|
+
end # class TestJson
|
data/test/test_page.rb
ADDED
@@ -0,0 +1,227 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestPage < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
## Runs before each test: makes sure the scratch directory tmp/ exists
## (test_br writes its debug copies there).
def setup
  ## note: Dir.exists? is deprecated and was removed in Ruby 3.2 - use Dir.exist?
  Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
end
|
12
|
+
|
13
|
+
## Smoke test for the saved br page: loads the html, calls doc / sects / data
## on the page and stores debug copies in tmp/ (no assertions on the result).
def test_br
  page = Factbook::Page.new( 'br' )
  page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_br.html" )

  ## preview - range 0..600 is inclusive
  pp page.html[0..600]

  ## keep a copy of the html for debugging
  puts "saving a copy to br.html for debugging"
  File.open( 'tmp/br.html', 'w') { |out| out.write( page.html ) }

  ## call doc and sects too (results are not inspected here)
  page.doc
  page.sects

  data = page.data
  pp data

  ## keep a copy of the data as (pretty printed) json for debugging
  puts "saving a copy to br.json for debugging"
  File.open( 'tmp/br.json', 'w') { |out| out.write( JSON.pretty_generate( data ) ) }
end
|
40
|
+
|
41
|
+
|
42
|
+
## Exploration helper for the br page: prints row/field/data and
## category/category_data counts, then the hash for every section.
##
## note: the name does not start with test_ (xxx_ prefix), so the minitest runner
##   does not pick it up.
## note: page.html is read without being assigned first - presumably the page
##   loads the html itself; not visible here (todo: confirm)
def xxx_test_br
  page = Factbook::Page.new( 'br' )

  ## print first 600 chars
  pp page.html[0..600]

  ## save for debugging

  ## note: Dir.exists? is deprecated and was removed in Ruby 3.2 - use Dir.exist?
  Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
  puts "saving a copy to br.html for debugging"
  File.open( 'tmp/br.html', 'w') do |f|
    f.write( page.html )
  end

  doc   = page.doc
  sects = page.sects

  rows      = doc.css( 'table tr' )
  field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
  data_ids  = rows.css( '#data' )

  puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

  cats            = rows.css( '.category' )
  cats_div        = rows.css( 'div.category' )
  cats_span       = rows.css( 'span.category' )
  ## everything with class category that is neither a div nor a span
  cats_other_size = cats.size - cats_div.size - cats_span.size

  cats_data            = rows.css( '.category_data' )
  cats_div_data        = rows.css( 'div.category_data' )
  cats_span_data       = rows.css( 'span.category_data' )
  ## everything with class category_data that is neither a div nor a span
  cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size

  puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
  puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"

  ## some check for structure
  if cats_other_size > 0
    puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
  end

  if cats_other_data_size > 0
    puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
  end

  ## stats( doc )

  sects.each_with_index do |sect,i|
    puts ''
    puts "############################"
    puts "#### stats sect #{i}:"
    pp page.sect_to_hash( sect )
  end
end
|
96
|
+
|
97
|
+
|
98
|
+
## Walks all table cells of doc and prints what it finds:
##  - a cell containing exactly one element with id field sets the current category
##  - a cell with id data is turned into a hash of (sub)category => text pairs
##    and stored under the current category
## The collected hash gets pretty printed at the end.
##
##  doc - node (set) that responds to css (e.g. the page doc or a section)
def stats( doc )
  rows      = doc.css( 'table tr' )
  cells     = doc.css( 'table tr td' )
  field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
  data_ids  = rows.css( '#data' )

  puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

  hash     = {}
  last_cat = nil    ## text of the last category (field) cell seen

  cells.each_with_index do |cell,i|
    ## next if i > 14    ## skip after xx for debugging for now

    field_cell = cell.css( '#field' ).size == 1   # (nested) div#field in td
    data_cell  = cell['id'] == 'data'             # td#data

    if field_cell

      cats = cell.css( 'div.category' )    ## note: ignore all .category not using div (issue warn/err if found!!) etc.
      if cats.size == 1
        text = cats.first.text.strip    # remove/strip leading and trailing spaces
        last_cat = text
        puts " [#{i}] category: >>#{text}<<"
      else
        puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
        puts cell.to_s
      end

    elsif data_cell

      cats           = cell.css( 'div.category' )    ## note: ignore all .category not using div (issue warn/err if found!!) etc.
      cats_data      = cell.css( 'div.category_data,span.category_data' )    ## note: ignore a.category_data etc.
      cats_div_data  = cell.css( 'div.category_data' )
      cats_span_data = cell.css( 'span.category_data' )

      puts " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"

      pairs      = []     ## [key, value] pairs in document order
      pair       = nil    ## pair currently being filled
      data_count = 0      ## number of category_data divs appended to pair so far

      ## loop over div blocks (might be .category or .category_data)
      cell.children.each do |child|
        next unless child.element?    ## skip text nodes and the like (w/o any message)

        unless child.name == 'div'
          puts " **** !!! skipping non-div >#{child.name}<:"
          puts child.to_s
          next
        end

        case child['class']
        when 'category'
          ## key: all text of the div except for nested elements w/ class category_data
          key_nodes = child.children.reject do |node|
            node.element? && node['class'] == 'category_data'
          end
          text  = key_nodes.map { |node| node.text.strip }.join
          value = child.css('span.category_data').text.strip

          puts " -- category >>#{text}<<"

          ## start new pair
          pair       = [ text, value ]
          data_count = 0
          pairs << pair

        when 'category_data'
          puts " -- category_data"

          text = child.text.strip

          if pair.nil?
            ## assume its the very first entry; use implied/auto-created category
            pair       = [ 'text', '' ]
            data_count = 0
            pairs << pair
          end

          if data_count > 0
            pair[1] += "; #{text}"    ## append with separator
          elsif pair[1] == ''
            pair[1] = text            ## first data and no value yet
          else
            pair[1] += " #{text}"     ## append w/o separator
          end
          data_count += 1

        else
          puts " **** !!! skipping div w/o category or category_data class:"
          puts child.to_s
        end
      end

      ## pairs to hash (for duplicate keys the last pair wins)
      hash[ last_cat ] = pairs.each_with_object( {} ) do |(key, value), pairs_hash|
        pairs_hash[ key ] = value
      end

    else
      puts "#### !!!! unknown cell type (no field or data id found):"
      puts cell.to_s
    end
  end # each cell

  pp hash
end # method stats
|
225
|
+
|
226
|
+
|
227
|
+
end # class TestPage
|
data/test/test_page_old.rb
ADDED
@@ -0,0 +1,290 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestPageOld < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
## Exploration helper for the mx page: prints how many panels, questions, answers,
## category and category_data nodes a couple of css selectors find.
##
## note: the name does not start with test_ (xxx_ prefix), so the minitest runner
##   does not pick it up.
def xxx_test_mx
  page = Factbook::Page.new( 'mx' )

  ## preview - range 0..600 is inclusive
  pp page.html[0..600]

  doc = page.doc

  panels    = doc.css( '.CollapsiblePanel' )
  questions = doc.css( '.question' )
  answers   = doc.css( '.answer' )

  puts "panels.size: #{panels.size}"
  puts "questions.size: #{questions.size}"
  puts "answers.size: #{answers.size}"

  ## counts inside the first panel
  first_panel      = panels[0]
  first_cats       = first_panel.css( '.category' )
  first_cats_data  = first_panel.css( '.category_data' )

  puts "cats0.size: #{first_cats.size}"
  puts "cats0_data.size: #{first_cats_data.size}"

  ## counts inside the second panel
  second_panel     = panels[1]
  second_cats      = second_panel.css( '.category' )
  second_cats_data = second_panel.css( '.category_data' )

  puts "cats1.size: #{second_cats.size}"
  puts "cats1_data.size: #{second_cats_data.size}"

  ## categories: div only vs. any element inside the geo panel vs. whole document
  geo_div_cats = doc.css( '#CollapsiblePanel1_Geo div.category' )
  puts "cat.size: #{geo_div_cats.size}"

  geo_cats = doc.css( '#CollapsiblePanel1_Geo .category' )
  puts "catcheck.size: #{geo_cats.size}"

  all_cats = doc.css( '.category' )
  puts "catcheck2.size: #{all_cats.size}"

  ## category data: geo panel vs. whole document
  geo_data = doc.css( '#CollapsiblePanel1_Geo .category_data' )
  puts "catdata.size: #{geo_data.size}"

  all_data = doc.css( '.category_data' )
  puts "catdatacheck2.size: #{all_data.size}"

  ## dump the first two category_data nodes of the geo panel
  puts "catdata[0]:"
  pp geo_data[0]

  puts "catdata[1]:"
  pp geo_data[1]
end
|
76
|
+
|
77
|
+
## Exploration helper for the mx page: saves a debug copy of the html, prints
## panel/row/field/data and category/category_data counts and then runs
## stats on every section.
##
## note: the name does not start with test_ (xxxx_ prefix), so the minitest runner
##   does not pick it up.
def xxxx_test_mx
  page = Factbook::Page.new( 'mx' )

  ## print first 600 chars
  pp page.html[0..600]

  ## save for debugging

  ## note: Dir.exists? is deprecated and was removed in Ruby 3.2 - use Dir.exist?
  Dir.mkdir( 'tmp' ) unless Dir.exist?( 'tmp' )
  puts "saving a copy to mx.html for debugging"
  File.open( 'tmp/mx.html', 'w') do |f|
    f.write( page.html )
  end

  doc   = page.doc
  sects = page.sects

  panels    = doc.css( '.CollapsiblePanel' )
  questions = doc.css( '.question' )
  answers   = doc.css( '.answer' )

  puts "panels.size: #{panels.size}"
  puts "questions.size: #{questions.size}"
  puts "answers.size: #{answers.size}"

  ## rows per panel (and summed up over all panels)
  rows_total = 0
  panels.each_with_index do |panel,i|
    rows = panel.css( 'table tr' )
    puts " [#{i}] rows.size: #{rows.size}"
    rows_total += rows.size
  end

  puts "rows_total: #{rows_total}"

  rows      = doc.css( 'table tr' )
  field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
  data_ids  = rows.css( '#data' )

  puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"

  cats            = rows.css( '.category' )
  cats_div        = rows.css( 'div.category' )
  cats_span       = rows.css( 'span.category' )
  ## everything with class category that is neither a div nor a span
  cats_other_size = cats.size - cats_div.size - cats_span.size

  cats_data            = rows.css( '.category_data' )
  cats_div_data        = rows.css( 'div.category_data' )
  cats_span_data       = rows.css( 'span.category_data' )
  ## everything with class category_data that is neither a div nor a span
  cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size

  puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
  puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"

  ## some check for structure
  if cats_other_size > 0
    puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
  end

  if cats_other_data_size > 0
    puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
  end

  ## stats( doc )

  sects.each_with_index do |sect,i|
    puts ''
    puts "############################"
    puts "#### stats sect #{i}:"
    stats( sect )
  end
end
|
148
|
+
|
149
|
+
|
150
|
+
## Walks all table cells of doc and prints a structure report per cell:
##  - cell with one element w/ id field  => category
##  - cell inside a row with one #data   => data (plain text or subcategories)
##  - anything else                       => warning
## Nothing gets collected; the method only prints.
##
##  doc - node (set) that responds to css (e.g. the page doc or a section)
def stats( doc )
  rows      = doc.css( 'table tr' )
  cells     = doc.css( 'table tr td' )
  field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
  data_ids  = rows.css( '#data' )

  puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"


  ## check rows
  ## todo/fix:
  ## loop over td's !!!

  cells.each_with_index do |cell,i|
    ## next if i > 14    ## skip after xx for debugging for now

    cats           = cell.css( 'div.category' )    ## note: ignore all .category not using div (issue warn/err if found!!) etc.
    cats_data      = cell.css( 'div.category_data,span.category_data' )    ## note: ignore a.category_data etc.
    cats_div_data  = cell.css( 'div.category_data' )
    cats_span_data = cell.css( 'span.category_data' )

    ## note: use per-cell names (do not overwrite the row-level field_ids / data_ids from above)
    cell_field_ids = cell.css( '#field' )    ## td div.field check - use div#field.category -- possible?

    ### fix: split into #field and #data
    ##  field has no category-data no sub/multiple categories etc.

    ## td#data
    # quick hack: use parent() - fix!! check id for element if present and is data how?? e.g. cell['id'] == 'data' ???
    cell_data_ids = cell.parent.css( '#data' )    ## will include self? e.g. td id='data' ???

    ids_size = cell_field_ids.size + cell_data_ids.size

    if ids_size == 0
      puts " ****!!!! no ids (field/data) found"
    end

    if ids_size > 1
      puts " ***!!! more than one id (field/data) found - #{ids_size}"
    end


    ## check for subcategory
    ##  must be div w/ id field and class category

    if cell_field_ids.size == 1    ## assume category

      if cats.size == 1 && cats_data.size == 0 && cats.first.name == 'div'
        text = cats.first.text.strip    # remove/strip leading and trailing spaces
        puts " [#{i}] category: >>#{text}<<"
      else
        puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
      end

    elsif cell_data_ids.size == 1

      if cats.size == 0
        if cats_data.size == 1    ## check for cats_data.first.name == 'div' too ???
          text = cats_data.first.text.strip    # remove/strip leading and trailing spaces
          puts " - [#{i}] data: >>#{text}<<"
        elsif cats_data.size > 1    ## check for cats_data.first.name == 'div' too ???
          text = cats_data.map { |cat_data| cat_data.text.strip }.join( '; ' )
          puts " - [#{i}] data#{cats_data.size}: >>#{text}<<"
        else
          # should not happen
          puts "*** !!!! warn/err - skip empty data cell (no cats/no cats_data)"
        end
      else    ## cats.size > 0 (a size is never negative - no third case possible)
        puts " [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size}/ cats_span_data: #{cats_span_data.size})"

        ## todo: handle "free standing" data blocks (not assigned to category/key)
        ##   - single data div: check if first or last entry
        ##       (if first entry use key *text*; otherwise use key *notes*)
        ##   - multiple data divs and one cat: always assume text for now (not *notes*)
        ##   - multiple data divs and multiple cats (e.g. drinking water source:)
        ##       to be done - for now use one all-in-one text blob
        ## (the former empty if/else skeleton for this was removed; one branch
        ##  tested size == 1 inside size > 1 and could never run)

        cats.each_with_index do |cat,j|    # note: use index - j (for inner loop)
          ## collect text for category; exclude element w/ class.category_data
          text = ""
          cat.children.each do |child|
            text << child.text.strip unless child.element? && child['class'] == 'category_data'
          end

          n = cat.css( '.category_data' )
          puts " -- [#{j}] subcategory: >>#{text}<< cats_data: #{n.size}"

          ## check if is div - if not issue warn
          if cat.name == 'div'
            ## check if includes one or more category_data nodes
            if n.size == 0
              puts " ****** !!! no category_data inside"
            end
            if n.size > 1
              puts " ****** !!! multiple category_data's inside - #{n.size}"
            end
          else
            puts " ****** !!!! no div - is >>#{cat.name}<<"
          end
        end
      end
    else
      puts "**** !!!!!!! [#{i}] cats: #{cats.size}, cats_data: #{cats_data.size}, field_ids: #{cell_field_ids.size}, data_ids: #{cell_data_ids.size}"
    end

    ## for debugging cells w/ more than one category use:
    ##   puts cell.to_s   if cats.size > 1
  end # each cell

end
|
288
|
+
|
289
|
+
|
290
|
+
end # class TestPageOld
|
data/test/test_strip.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestStrip < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
## Checks that the regex for the "country comparison to the world" markup
## (span.category followed by span.category_data w/ a link) matches a sample
## snippet; the building blocks of the regex get checked one by one first.
def test_country_comparison

  ## note: query string repaired - was garbled to "®ionCode" (from &regionCode)
  html=<<EOS

<span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data"> <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>

EOS

  ## note: need to escape space!!!! e.g. use to\s the\s world etc.
  ## Note: To match whitespace in an x pattern use an escape such as \s or \p{Space}.

  country_comparison_regex = /
    <span \s class="category"[^>]*>
    country \s comparison \s to \s the \s world:
    <\/span>
    \s*
    <span \s class="category_data"[^>]*>
    \s*
    <a \s [^>]+>
    .+?
    <\/a>
    \s*
    <\/span>
    /xm

  country_comparison_space_regex = /
    country \s comparison \s to \s the \s world:
    /xm

  country_comparison_span_regex = /
    <span \s class="category"[^>]*>
    /xm

  country_comparison_cat_regex = /
    <span \s class="category"[^>]*>
    country \s comparison \s to \s the \s world:
    <\/span>
    /xm

  ## check from the smallest building block up to the complete regex;
  ##  the label tells which step failed
  checks = {
    'text only (escaped spaces)' => country_comparison_space_regex,
    'opening span.category'      => country_comparison_span_regex,
    'complete span.category'     => country_comparison_cat_regex,
    'complete regex'             => country_comparison_regex,
  }

  checks.each do |label, regex|
    m = regex.match( html )
    pp m
    assert m, "no match for #{label}"    # must find a match
  end
end
|
65
|
+
|
66
|
+
end # class TestStrip
|