factbook 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/test/test_json.rb CHANGED
@@ -6,40 +6,42 @@ require 'helper'
6
6
 
7
7
  class TestJson < MiniTest::Unit::TestCase
8
8
 
9
- def setup
10
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
11
- end
12
9
 
13
10
  def test_json
14
- gen_json_for( 'au')
15
- gen_json_for( 'be')
16
- gen_json_for( 'br')
17
- gen_json_for( 'mx')
18
- end
19
-
20
- def gen_json_for( code )
21
- page = Factbook::Page.new( code )
22
- page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_#{code}.html" )
23
-
24
- ## print first 600 chars
25
- pp page.html[0..600]
26
-
27
- ## save for debuging
11
+ Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
12
+
13
+ codes = [ 'au',
14
+ 'be',
15
+ 'br',
16
+ 'mx',
17
+ 'ls',
18
+ 'vt',
19
+ 'ee',
20
+ 'xx' ]
21
+
22
+ codes.each do |code|
23
+ page = Factbook::Page.new( code )
24
+ page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_#{code}.html" )
25
+
26
+ ## print first 600 chars
27
+ pp page.html[0..600]
28
+
29
+ ## save for debuging
28
30
 
29
- puts "saving a copy to #{code}.html for debugging"
30
- File.open( "tmp/#{code}.html", 'w') do |f|
31
- f.write( page.html )
32
- end
31
+ puts "saving a copy to #{code}.html for debugging"
32
+ File.open( "tmp/#{code}.html", 'w' ) do |f|
33
+ f.write page.html
34
+ end
33
35
 
34
- h = page.data
35
- pp h
36
+ h = page.data
37
+ pp h
36
38
 
37
- ### save to json
38
- puts "saving a copy to #{code}.json for debugging"
39
- File.open( "tmp/#{code}.json", 'w') do |f|
40
- f.write( JSON.pretty_generate( h ) )
41
- end
39
+ ### save to json
40
+ puts "saving a copy to #{code}.json for debugging"
41
+ File.open( "tmp/#{code}.json", 'w' ) do |f|
42
+ f.write JSON.pretty_generate( h )
43
+ end
44
+ end
42
45
  end
43
46
 
44
-
45
47
  end # class TestJson
data/test/test_page.rb CHANGED
@@ -6,222 +6,31 @@ require 'helper'
6
6
 
7
7
  class TestPage < MiniTest::Unit::TestCase
8
8
 
9
- def setup
10
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
11
- end
12
9
 
13
- def test_br
14
- page = Factbook::Page.new( 'br' )
10
+ def test_sects
11
+ pages = [
12
+ [ 'au', 10 ],
13
+ [ 'be', 10 ],
14
+ [ 'br', 10 ],
15
+ [ 'ee', 10 ],
16
+ [ 'mx', 10 ],
17
+ [ 'xx', 10 ],
18
+ [ 'ls', 9 ],
19
+ [ 'vt', 8 ]]
15
20
 
16
- page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_br.html" )
21
+ pages.each do |rec|
22
+ code = rec[0]
23
+ sects_size = rec[1]
17
24
 
18
- ## print first 600 chars
19
- pp page.html[0..600]
20
-
21
- ## save for debuging
22
-
23
- puts "saving a copy to br.html for debugging"
24
- File.open( 'tmp/br.html', 'w') do |f|
25
- f.write( page.html )
26
- end
25
+ page = Factbook::Page.new( code )
26
+ page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_#{code}.html" )
27
27
 
28
- doc = page.doc
29
- sects = page.sects
28
+ ## print first 600 chars
29
+ pp page.html[0..600]
30
30
 
31
- h = page.data
32
- pp h
33
-
34
- ### save to json
35
- puts "saving a copy to br.json for debugging"
36
- File.open( 'tmp/br.json', 'w') do |f|
37
- f.write( JSON.pretty_generate( h ) )
31
+ assert_equal sects_size, page.sects.size
38
32
  end
39
33
  end
40
34
 
41
35
 
42
- def xxx_test_br
43
- page = Factbook::Page.new( 'br' )
44
-
45
- ## print first 600 chars
46
- pp page.html[0..600]
47
-
48
- ## save for debuging
49
-
50
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
51
- puts "saving a copy to br.html for debugging"
52
- File.open( 'tmp/br.html', 'w') do |f|
53
- f.write( page.html )
54
- end
55
-
56
- doc = page.doc
57
- sects = page.sects
58
-
59
- rows = doc.css( 'table tr' )
60
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
61
- data_ids = rows.css( '#data' )
62
-
63
- puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
64
-
65
- cats = rows.css( '.category' )
66
- cats_div = rows.css( 'div.category' )
67
- cats_span = rows.css( 'span.category' )
68
- cats_other_size = cats.size - cats_div.size - cats_span.size
69
-
70
- cats_data = rows.css( '.category_data' )
71
- cats_div_data = rows.css( 'div.category_data' )
72
- cats_span_data = rows.css( 'span.category_data' )
73
- cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
74
-
75
- puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
76
- puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
77
-
78
- ## some check for structure
79
- if cats_other_size > 0
80
- puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
81
- end
82
-
83
- if cats_other_data_size > 0
84
- puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
85
- end
86
-
87
- ## stats( doc )
88
-
89
- sects.each_with_index do |sect,i|
90
- puts ''
91
- puts "############################"
92
- puts "#### stats sect #{i}:"
93
- pp page.sect_to_hash( sect )
94
- end
95
- end
96
-
97
-
98
- def stats( doc )
99
- rows = doc.css( 'table tr' )
100
- cells = doc.css( 'table tr td' )
101
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
102
- data_ids = rows.css( '#data' )
103
-
104
- puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
105
-
106
- hash = {}
107
- last_cat = nil
108
-
109
-
110
- cells.each_with_index do |cell,i|
111
- ## next if i > 14 ## skip after xx for debugging for now
112
-
113
- # check if field or data id
114
-
115
- # check for (nested) div#field in td
116
- has_field_id = cell.css( '#field' ).size == 1 ? true : false
117
-
118
- # check for td#data
119
- has_data_id = cell['id'] == 'data' ? true : false
120
-
121
- if has_field_id
122
-
123
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
124
- if cats.size == 1
125
- text = cats.first.text.strip # remove/strip leading and trailing spaces
126
- last_cat = text
127
- puts " [#{i}] category: >>#{text}<<"
128
- else
129
- puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
130
- puts cell.to_s
131
- end
132
-
133
- elsif has_data_id
134
-
135
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
136
- cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
137
- cats_div_data = cell.css( 'div.category_data' )
138
- cats_span_data = cell.css( 'span.category_data' )
139
-
140
- puts " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
141
-
142
- pairs = []
143
- last_pair = nil
144
- last_pair_data_count = 0
145
-
146
- ## loop over div blocks (might be .category or .category_data)
147
- cell.children.each_with_index do |child,j|
148
- unless child.element?
149
- ## puts " **** !!!! skipping non-element type >#{child.type}<:"
150
- ## puts child.to_s
151
- next
152
- end
153
- unless child.name == 'div'
154
- puts " **** !!! skipping non-div >#{child.name}<:"
155
- puts child.to_s
156
- next
157
- end
158
-
159
- ### check if .category or .category_data
160
- if child['class'] == 'category'
161
-
162
- ## collect text for category; exclude element w/ class.category_data
163
- text = ""
164
- child.children.each do |subchild|
165
- text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
166
- end
167
-
168
- value = child.css('span.category_data').text.strip
169
-
170
- puts " -- category >>#{text}<<"
171
-
172
- ## start new pair
173
- last_pair = [ text, value ]
174
- last_pair_data_count = 0
175
- pairs << last_pair
176
-
177
- elsif child['class'] == 'category_data'
178
- puts " -- category_data"
179
-
180
- text = child.text.strip
181
-
182
- if last_pair.nil?
183
- ## assume its the very first entry; use implied/auto-created category
184
- last_pair = [ 'text', '' ]
185
- last_pair_data_count = 0
186
- pairs << last_pair
187
- end
188
-
189
- ### first category_data element?
190
- if last_pair_data_count == 0
191
- if last_pair[1] == ''
192
- last_pair[1] = text
193
- else
194
- last_pair[1] += " #{text}" ## append w/o separator
195
- end
196
- else
197
- last_pair[1] += "; #{text}" ## append with separator
198
- end
199
- last_pair_data_count += 1
200
-
201
- else
202
- puts " **** !!! skipping div w/o category or category_data class:"
203
- puts child.to_s
204
- end
205
- end
206
-
207
- ## pp pairs
208
-
209
- ## pairs to hash
210
- pairs_hash = {}
211
- pairs.each do |pair|
212
- pairs_hash[ pair[0] ] = pair[1]
213
- end
214
-
215
- hash[ last_cat ] = pairs_hash
216
-
217
- else
218
- puts "#### !!!! unknown cell type (no field or data id found):"
219
- puts cell.to_s
220
- end
221
- end # each cell
222
-
223
- pp hash
224
- end # method stats
225
-
226
-
227
36
  end # class TestPage
@@ -6,7 +6,195 @@ require 'helper'
6
6
 
7
7
  class TestPageOld < MiniTest::Unit::TestCase
8
8
 
9
- def xxx_test_mx
9
+
10
+ def xxx_test_br
11
+ Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
12
+
13
+ page = Factbook::Page.new( 'br' )
14
+
15
+ ## print first 600 chars
16
+ pp page.html[0..600]
17
+
18
+ ## save for debuging
19
+
20
+ Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
21
+ puts "saving a copy to br.html for debugging"
22
+ File.open( 'tmp/br.html', 'w') do |f|
23
+ f.write( page.html )
24
+ end
25
+
26
+ doc = page.doc
27
+ sects = page.sects
28
+
29
+ rows = doc.css( 'table tr' )
30
+ field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
31
+ data_ids = rows.css( '#data' )
32
+
33
+ puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
34
+
35
+ cats = rows.css( '.category' )
36
+ cats_div = rows.css( 'div.category' )
37
+ cats_span = rows.css( 'span.category' )
38
+ cats_other_size = cats.size - cats_div.size - cats_span.size
39
+
40
+ cats_data = rows.css( '.category_data' )
41
+ cats_div_data = rows.css( 'div.category_data' )
42
+ cats_span_data = rows.css( 'span.category_data' )
43
+ cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
44
+
45
+ puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
46
+ puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
47
+
48
+ ## some check for structure
49
+ if cats_other_size > 0
50
+ puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
51
+ end
52
+
53
+ if cats_other_data_size > 0
54
+ puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
55
+ end
56
+
57
+ ## stats( doc )
58
+
59
+ sects.each_with_index do |sect,i|
60
+ puts ''
61
+ puts "############################"
62
+ puts "#### stats sect #{i}:"
63
+ pp page.sect_to_hash( sect )
64
+ end
65
+ end
66
+
67
+
68
+ def xxx_stats( doc )
69
+ rows = doc.css( 'table tr' )
70
+ cells = doc.css( 'table tr td' )
71
+ field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
72
+ data_ids = rows.css( '#data' )
73
+
74
+ puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
75
+
76
+ hash = {}
77
+ last_cat = nil
78
+
79
+
80
+ cells.each_with_index do |cell,i|
81
+ ## next if i > 14 ## skip after xx for debugging for now
82
+
83
+ # check if field or data id
84
+
85
+ # check for (nested) div#field in td
86
+ has_field_id = cell.css( '#field' ).size == 1 ? true : false
87
+
88
+ # check for td#data
89
+ has_data_id = cell['id'] == 'data' ? true : false
90
+
91
+ if has_field_id
92
+
93
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
94
+ if cats.size == 1
95
+ text = cats.first.text.strip # remove/strip leading and trailing spaces
96
+ last_cat = text
97
+ puts " [#{i}] category: >>#{text}<<"
98
+ else
99
+ puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
100
+ puts cell.to_s
101
+ end
102
+
103
+ elsif has_data_id
104
+
105
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
106
+ cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
107
+ cats_div_data = cell.css( 'div.category_data' )
108
+ cats_span_data = cell.css( 'span.category_data' )
109
+
110
+ puts " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
111
+
112
+ pairs = []
113
+ last_pair = nil
114
+ last_pair_data_count = 0
115
+
116
+ ## loop over div blocks (might be .category or .category_data)
117
+ cell.children.each_with_index do |child,j|
118
+ unless child.element?
119
+ ## puts " **** !!!! skipping non-element type >#{child.type}<:"
120
+ ## puts child.to_s
121
+ next
122
+ end
123
+ unless child.name == 'div'
124
+ puts " **** !!! skipping non-div >#{child.name}<:"
125
+ puts child.to_s
126
+ next
127
+ end
128
+
129
+ ### check if .category or .category_data
130
+ if child['class'] == 'category'
131
+
132
+ ## collect text for category; exclude element w/ class.category_data
133
+ text = ""
134
+ child.children.each do |subchild|
135
+ text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
136
+ end
137
+
138
+ value = child.css('span.category_data').text.strip
139
+
140
+ puts " -- category >>#{text}<<"
141
+
142
+ ## start new pair
143
+ last_pair = [ text, value ]
144
+ last_pair_data_count = 0
145
+ pairs << last_pair
146
+
147
+ elsif child['class'] == 'category_data'
148
+ puts " -- category_data"
149
+
150
+ text = child.text.strip
151
+
152
+ if last_pair.nil?
153
+ ## assume its the very first entry; use implied/auto-created category
154
+ last_pair = [ 'text', '' ]
155
+ last_pair_data_count = 0
156
+ pairs << last_pair
157
+ end
158
+
159
+ ### first category_data element?
160
+ if last_pair_data_count == 0
161
+ if last_pair[1] == ''
162
+ last_pair[1] = text
163
+ else
164
+ last_pair[1] += " #{text}" ## append w/o separator
165
+ end
166
+ else
167
+ last_pair[1] += "; #{text}" ## append with separator
168
+ end
169
+ last_pair_data_count += 1
170
+
171
+ else
172
+ puts " **** !!! skipping div w/o category or category_data class:"
173
+ puts child.to_s
174
+ end
175
+ end
176
+
177
+ ## pp pairs
178
+
179
+ ## pairs to hash
180
+ pairs_hash = {}
181
+ pairs.each do |pair|
182
+ pairs_hash[ pair[0] ] = pair[1]
183
+ end
184
+
185
+ hash[ last_cat ] = pairs_hash
186
+
187
+ else
188
+ puts "#### !!!! unknown cell type (no field or data id found):"
189
+ puts cell.to_s
190
+ end
191
+ end # each cell
192
+
193
+ pp hash
194
+ end # method stats
195
+
196
+
197
+ def yyy_test_mx
10
198
  page = Factbook::Page.new( 'mx' )
11
199
 
12
200
  ## print first 600 chars
@@ -74,7 +262,7 @@ class TestPageOld < MiniTest::Unit::TestCase
74
262
 
75
263
  end
76
264
 
77
- def xxxx_test_mx
265
+ def yyy_test_mx
78
266
  page = Factbook::Page.new( 'mx' )
79
267
 
80
268
  ## print first 600 chars
@@ -147,7 +335,7 @@ class TestPageOld < MiniTest::Unit::TestCase
147
335
  end
148
336
 
149
337
 
150
- def stats( doc )
338
+ def yyy_stats( doc )
151
339
  rows = doc.css( 'table tr' )
152
340
  cells = doc.css( 'table tr td' )
153
341
  field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: factbook
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-07-12 00:00:00.000000000 Z
12
+ date: 2014-07-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: logutils
16
- requirement: &73732120 !ruby/object:Gem::Requirement
16
+ requirement: &82513280 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *73732120
24
+ version_requirements: *82513280
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: fetcher
27
- requirement: &73731730 !ruby/object:Gem::Requirement
27
+ requirement: &82544000 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *73731730
35
+ version_requirements: *82544000
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &73731340 !ruby/object:Gem::Requirement
38
+ requirement: &82543300 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *73731340
46
+ version_requirements: *82543300
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rdoc
49
- requirement: &73730940 !ruby/object:Gem::Requirement
49
+ requirement: &82542590 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '4.0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *73730940
57
+ version_requirements: *82542590
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: hoe
60
- requirement: &73730530 !ruby/object:Gem::Requirement
60
+ requirement: &82541780 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '3.11'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *73730530
68
+ version_requirements: *82541780
69
69
  description: factbook - scripts for the world factbook (get open structured data e.g
70
70
  JSON etc.)
71
71
  email: openmundi@googlegroups.com
@@ -82,11 +82,16 @@ files:
82
82
  - Rakefile
83
83
  - lib/factbook.rb
84
84
  - lib/factbook/page.rb
85
+ - lib/factbook/sect.rb
85
86
  - lib/factbook/version.rb
86
87
  - test/data/countrytemplate_au.html
87
88
  - test/data/countrytemplate_be.html
88
89
  - test/data/countrytemplate_br.html
90
+ - test/data/countrytemplate_ee.html
91
+ - test/data/countrytemplate_ls.html
89
92
  - test/data/countrytemplate_mx.html
93
+ - test/data/countrytemplate_vt.html
94
+ - test/data/countrytemplate_xx.html
90
95
  - test/helper.rb
91
96
  - test/test_json.rb
92
97
  - test/test_page.rb