factbook 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/test/test_json.rb CHANGED
@@ -6,40 +6,42 @@ require 'helper'
6
6
 
7
7
  class TestJson < MiniTest::Unit::TestCase
8
8
 
9
- def setup
10
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
11
- end
12
9
 
13
10
  def test_json
14
- gen_json_for( 'au')
15
- gen_json_for( 'be')
16
- gen_json_for( 'br')
17
- gen_json_for( 'mx')
18
- end
19
-
20
- def gen_json_for( code )
21
- page = Factbook::Page.new( code )
22
- page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_#{code}.html" )
23
-
24
- ## print first 600 chars
25
- pp page.html[0..600]
26
-
27
- ## save for debuging
11
+ Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
12
+
13
+ codes = [ 'au',
14
+ 'be',
15
+ 'br',
16
+ 'mx',
17
+ 'ls',
18
+ 'vt',
19
+ 'ee',
20
+ 'xx' ]
21
+
22
+ codes.each do |code|
23
+ page = Factbook::Page.new( code )
24
+ page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_#{code}.html" )
25
+
26
+ ## print first 600 chars
27
+ pp page.html[0..600]
28
+
29
+ ## save for debuging
28
30
 
29
- puts "saving a copy to #{code}.html for debugging"
30
- File.open( "tmp/#{code}.html", 'w') do |f|
31
- f.write( page.html )
32
- end
31
+ puts "saving a copy to #{code}.html for debugging"
32
+ File.open( "tmp/#{code}.html", 'w' ) do |f|
33
+ f.write page.html
34
+ end
33
35
 
34
- h = page.data
35
- pp h
36
+ h = page.data
37
+ pp h
36
38
 
37
- ### save to json
38
- puts "saving a copy to #{code}.json for debugging"
39
- File.open( "tmp/#{code}.json", 'w') do |f|
40
- f.write( JSON.pretty_generate( h ) )
41
- end
39
+ ### save to json
40
+ puts "saving a copy to #{code}.json for debugging"
41
+ File.open( "tmp/#{code}.json", 'w' ) do |f|
42
+ f.write JSON.pretty_generate( h )
43
+ end
44
+ end
42
45
  end
43
46
 
44
-
45
47
  end # class TestJson
data/test/test_page.rb CHANGED
@@ -6,222 +6,31 @@ require 'helper'
6
6
 
7
7
  class TestPage < MiniTest::Unit::TestCase
8
8
 
9
- def setup
10
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
11
- end
12
9
 
13
- def test_br
14
- page = Factbook::Page.new( 'br' )
10
+ def test_sects
11
+ pages = [
12
+ [ 'au', 10 ],
13
+ [ 'be', 10 ],
14
+ [ 'br', 10 ],
15
+ [ 'ee', 10 ],
16
+ [ 'mx', 10 ],
17
+ [ 'xx', 10 ],
18
+ [ 'ls', 9 ],
19
+ [ 'vt', 8 ]]
15
20
 
16
- page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_br.html" )
21
+ pages.each do |rec|
22
+ code = rec[0]
23
+ sects_size = rec[1]
17
24
 
18
- ## print first 600 chars
19
- pp page.html[0..600]
20
-
21
- ## save for debuging
22
-
23
- puts "saving a copy to br.html for debugging"
24
- File.open( 'tmp/br.html', 'w') do |f|
25
- f.write( page.html )
26
- end
25
+ page = Factbook::Page.new( code )
26
+ page.html = File.read( "#{Factbook.root}/test/data/countrytemplate_#{code}.html" )
27
27
 
28
- doc = page.doc
29
- sects = page.sects
28
+ ## print first 600 chars
29
+ pp page.html[0..600]
30
30
 
31
- h = page.data
32
- pp h
33
-
34
- ### save to json
35
- puts "saving a copy to br.json for debugging"
36
- File.open( 'tmp/br.json', 'w') do |f|
37
- f.write( JSON.pretty_generate( h ) )
31
+ assert_equal sects_size, page.sects.size
38
32
  end
39
33
  end
40
34
 
41
35
 
42
- def xxx_test_br
43
- page = Factbook::Page.new( 'br' )
44
-
45
- ## print first 600 chars
46
- pp page.html[0..600]
47
-
48
- ## save for debuging
49
-
50
- Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
51
- puts "saving a copy to br.html for debugging"
52
- File.open( 'tmp/br.html', 'w') do |f|
53
- f.write( page.html )
54
- end
55
-
56
- doc = page.doc
57
- sects = page.sects
58
-
59
- rows = doc.css( 'table tr' )
60
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
61
- data_ids = rows.css( '#data' )
62
-
63
- puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
64
-
65
- cats = rows.css( '.category' )
66
- cats_div = rows.css( 'div.category' )
67
- cats_span = rows.css( 'span.category' )
68
- cats_other_size = cats.size - cats_div.size - cats_span.size
69
-
70
- cats_data = rows.css( '.category_data' )
71
- cats_div_data = rows.css( 'div.category_data' )
72
- cats_span_data = rows.css( 'span.category_data' )
73
- cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
74
-
75
- puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
76
- puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
77
-
78
- ## some check for structure
79
- if cats_other_size > 0
80
- puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
81
- end
82
-
83
- if cats_other_data_size > 0
84
- puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
85
- end
86
-
87
- ## stats( doc )
88
-
89
- sects.each_with_index do |sect,i|
90
- puts ''
91
- puts "############################"
92
- puts "#### stats sect #{i}:"
93
- pp page.sect_to_hash( sect )
94
- end
95
- end
96
-
97
-
98
- def stats( doc )
99
- rows = doc.css( 'table tr' )
100
- cells = doc.css( 'table tr td' )
101
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
102
- data_ids = rows.css( '#data' )
103
-
104
- puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
105
-
106
- hash = {}
107
- last_cat = nil
108
-
109
-
110
- cells.each_with_index do |cell,i|
111
- ## next if i > 14 ## skip after xx for debugging for now
112
-
113
- # check if field or data id
114
-
115
- # check for (nested) div#field in td
116
- has_field_id = cell.css( '#field' ).size == 1 ? true : false
117
-
118
- # check for td#data
119
- has_data_id = cell['id'] == 'data' ? true : false
120
-
121
- if has_field_id
122
-
123
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
124
- if cats.size == 1
125
- text = cats.first.text.strip # remove/strip leading and trailing spaces
126
- last_cat = text
127
- puts " [#{i}] category: >>#{text}<<"
128
- else
129
- puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
130
- puts cell.to_s
131
- end
132
-
133
- elsif has_data_id
134
-
135
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
136
- cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
137
- cats_div_data = cell.css( 'div.category_data' )
138
- cats_span_data = cell.css( 'span.category_data' )
139
-
140
- puts " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
141
-
142
- pairs = []
143
- last_pair = nil
144
- last_pair_data_count = 0
145
-
146
- ## loop over div blocks (might be .category or .category_data)
147
- cell.children.each_with_index do |child,j|
148
- unless child.element?
149
- ## puts " **** !!!! skipping non-element type >#{child.type}<:"
150
- ## puts child.to_s
151
- next
152
- end
153
- unless child.name == 'div'
154
- puts " **** !!! skipping non-div >#{child.name}<:"
155
- puts child.to_s
156
- next
157
- end
158
-
159
- ### check if .category or .category_data
160
- if child['class'] == 'category'
161
-
162
- ## collect text for category; exclude element w/ class.category_data
163
- text = ""
164
- child.children.each do |subchild|
165
- text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
166
- end
167
-
168
- value = child.css('span.category_data').text.strip
169
-
170
- puts " -- category >>#{text}<<"
171
-
172
- ## start new pair
173
- last_pair = [ text, value ]
174
- last_pair_data_count = 0
175
- pairs << last_pair
176
-
177
- elsif child['class'] == 'category_data'
178
- puts " -- category_data"
179
-
180
- text = child.text.strip
181
-
182
- if last_pair.nil?
183
- ## assume its the very first entry; use implied/auto-created category
184
- last_pair = [ 'text', '' ]
185
- last_pair_data_count = 0
186
- pairs << last_pair
187
- end
188
-
189
- ### first category_data element?
190
- if last_pair_data_count == 0
191
- if last_pair[1] == ''
192
- last_pair[1] = text
193
- else
194
- last_pair[1] += " #{text}" ## append w/o separator
195
- end
196
- else
197
- last_pair[1] += "; #{text}" ## append with separator
198
- end
199
- last_pair_data_count += 1
200
-
201
- else
202
- puts " **** !!! skipping div w/o category or category_data class:"
203
- puts child.to_s
204
- end
205
- end
206
-
207
- ## pp pairs
208
-
209
- ## pairs to hash
210
- pairs_hash = {}
211
- pairs.each do |pair|
212
- pairs_hash[ pair[0] ] = pair[1]
213
- end
214
-
215
- hash[ last_cat ] = pairs_hash
216
-
217
- else
218
- puts "#### !!!! unknown cell type (no field or data id found):"
219
- puts cell.to_s
220
- end
221
- end # each cell
222
-
223
- pp hash
224
- end # method stats
225
-
226
-
227
36
  end # class TestPage
@@ -6,7 +6,195 @@ require 'helper'
6
6
 
7
7
  class TestPageOld < MiniTest::Unit::TestCase
8
8
 
9
- def xxx_test_mx
9
+
10
+ def xxx_test_br
11
+ Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
12
+
13
+ page = Factbook::Page.new( 'br' )
14
+
15
+ ## print first 600 chars
16
+ pp page.html[0..600]
17
+
18
+ ## save for debuging
19
+
20
+ Dir.mkdir( 'tmp' ) unless Dir.exists?( 'tmp' )
21
+ puts "saving a copy to br.html for debugging"
22
+ File.open( 'tmp/br.html', 'w') do |f|
23
+ f.write( page.html )
24
+ end
25
+
26
+ doc = page.doc
27
+ sects = page.sects
28
+
29
+ rows = doc.css( 'table tr' )
30
+ field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
31
+ data_ids = rows.css( '#data' )
32
+
33
+ puts "rows.size: #{rows.size} (field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
34
+
35
+ cats = rows.css( '.category' )
36
+ cats_div = rows.css( 'div.category' )
37
+ cats_span = rows.css( 'span.category' )
38
+ cats_other_size = cats.size - cats_div.size - cats_span.size
39
+
40
+ cats_data = rows.css( '.category_data' )
41
+ cats_div_data = rows.css( 'div.category_data' )
42
+ cats_span_data = rows.css( 'span.category_data' )
43
+ cats_other_data_size = cats_data.size - cats_div_data.size - cats_span_data.size
44
+
45
+ puts "cats.size: #{cats.size} (cats_div.size #{cats_div.size} / cats_span.size #{cats_span.size} / cats_other.size #{cats_other_size})"
46
+ puts "cats_data.size: #{cats_data.size} (cats_div_data.size #{cats_div_data.size} / cats_span_data.size #{cats_span_data.size} / cats_other_data.size #{cats_other_data_size})"
47
+
48
+ ## some check for structure
49
+ if cats_other_size > 0
50
+ puts " ****!!!! category other (not div/span) found - #{cats_other_size}"
51
+ end
52
+
53
+ if cats_other_data_size > 0
54
+ puts " ****!!!! category_data other (not div/span) found - #{cats_other_data_size}"
55
+ end
56
+
57
+ ## stats( doc )
58
+
59
+ sects.each_with_index do |sect,i|
60
+ puts ''
61
+ puts "############################"
62
+ puts "#### stats sect #{i}:"
63
+ pp page.sect_to_hash( sect )
64
+ end
65
+ end
66
+
67
+
68
+ def xxx_stats( doc )
69
+ rows = doc.css( 'table tr' )
70
+ cells = doc.css( 'table tr td' )
71
+ field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
72
+ data_ids = rows.css( '#data' )
73
+
74
+ puts "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
75
+
76
+ hash = {}
77
+ last_cat = nil
78
+
79
+
80
+ cells.each_with_index do |cell,i|
81
+ ## next if i > 14 ## skip after xx for debugging for now
82
+
83
+ # check if field or data id
84
+
85
+ # check for (nested) div#field in td
86
+ has_field_id = cell.css( '#field' ).size == 1 ? true : false
87
+
88
+ # check for td#data
89
+ has_data_id = cell['id'] == 'data' ? true : false
90
+
91
+ if has_field_id
92
+
93
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
94
+ if cats.size == 1
95
+ text = cats.first.text.strip # remove/strip leading and trailing spaces
96
+ last_cat = text
97
+ puts " [#{i}] category: >>#{text}<<"
98
+ else
99
+ puts "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
100
+ puts cell.to_s
101
+ end
102
+
103
+ elsif has_data_id
104
+
105
+ cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
106
+ cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
107
+ cats_div_data = cell.css( 'div.category_data' )
108
+ cats_span_data = cell.css( 'span.category_data' )
109
+
110
+ puts " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
111
+
112
+ pairs = []
113
+ last_pair = nil
114
+ last_pair_data_count = 0
115
+
116
+ ## loop over div blocks (might be .category or .category_data)
117
+ cell.children.each_with_index do |child,j|
118
+ unless child.element?
119
+ ## puts " **** !!!! skipping non-element type >#{child.type}<:"
120
+ ## puts child.to_s
121
+ next
122
+ end
123
+ unless child.name == 'div'
124
+ puts " **** !!! skipping non-div >#{child.name}<:"
125
+ puts child.to_s
126
+ next
127
+ end
128
+
129
+ ### check if .category or .category_data
130
+ if child['class'] == 'category'
131
+
132
+ ## collect text for category; exclude element w/ class.category_data
133
+ text = ""
134
+ child.children.each do |subchild|
135
+ text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
136
+ end
137
+
138
+ value = child.css('span.category_data').text.strip
139
+
140
+ puts " -- category >>#{text}<<"
141
+
142
+ ## start new pair
143
+ last_pair = [ text, value ]
144
+ last_pair_data_count = 0
145
+ pairs << last_pair
146
+
147
+ elsif child['class'] == 'category_data'
148
+ puts " -- category_data"
149
+
150
+ text = child.text.strip
151
+
152
+ if last_pair.nil?
153
+ ## assume its the very first entry; use implied/auto-created category
154
+ last_pair = [ 'text', '' ]
155
+ last_pair_data_count = 0
156
+ pairs << last_pair
157
+ end
158
+
159
+ ### first category_data element?
160
+ if last_pair_data_count == 0
161
+ if last_pair[1] == ''
162
+ last_pair[1] = text
163
+ else
164
+ last_pair[1] += " #{text}" ## append w/o separator
165
+ end
166
+ else
167
+ last_pair[1] += "; #{text}" ## append with separator
168
+ end
169
+ last_pair_data_count += 1
170
+
171
+ else
172
+ puts " **** !!! skipping div w/o category or category_data class:"
173
+ puts child.to_s
174
+ end
175
+ end
176
+
177
+ ## pp pairs
178
+
179
+ ## pairs to hash
180
+ pairs_hash = {}
181
+ pairs.each do |pair|
182
+ pairs_hash[ pair[0] ] = pair[1]
183
+ end
184
+
185
+ hash[ last_cat ] = pairs_hash
186
+
187
+ else
188
+ puts "#### !!!! unknown cell type (no field or data id found):"
189
+ puts cell.to_s
190
+ end
191
+ end # each cell
192
+
193
+ pp hash
194
+ end # method stats
195
+
196
+
197
+ def yyy_test_mx
10
198
  page = Factbook::Page.new( 'mx' )
11
199
 
12
200
  ## print first 600 chars
@@ -74,7 +262,7 @@ class TestPageOld < MiniTest::Unit::TestCase
74
262
 
75
263
  end
76
264
 
77
- def xxxx_test_mx
265
+ def yyy_test_mx
78
266
  page = Factbook::Page.new( 'mx' )
79
267
 
80
268
  ## print first 600 chars
@@ -147,7 +335,7 @@ class TestPageOld < MiniTest::Unit::TestCase
147
335
  end
148
336
 
149
337
 
150
- def stats( doc )
338
+ def yyy_stats( doc )
151
339
  rows = doc.css( 'table tr' )
152
340
  cells = doc.css( 'table tr td' )
153
341
  field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: factbook
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-07-12 00:00:00.000000000 Z
12
+ date: 2014-07-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: logutils
16
- requirement: &73732120 !ruby/object:Gem::Requirement
16
+ requirement: &82513280 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *73732120
24
+ version_requirements: *82513280
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: fetcher
27
- requirement: &73731730 !ruby/object:Gem::Requirement
27
+ requirement: &82544000 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *73731730
35
+ version_requirements: *82544000
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &73731340 !ruby/object:Gem::Requirement
38
+ requirement: &82543300 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *73731340
46
+ version_requirements: *82543300
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rdoc
49
- requirement: &73730940 !ruby/object:Gem::Requirement
49
+ requirement: &82542590 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '4.0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *73730940
57
+ version_requirements: *82542590
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: hoe
60
- requirement: &73730530 !ruby/object:Gem::Requirement
60
+ requirement: &82541780 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '3.11'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *73730530
68
+ version_requirements: *82541780
69
69
  description: factbook - scripts for the world factbook (get open structured data e.g
70
70
  JSON etc.)
71
71
  email: openmundi@googlegroups.com
@@ -82,11 +82,16 @@ files:
82
82
  - Rakefile
83
83
  - lib/factbook.rb
84
84
  - lib/factbook/page.rb
85
+ - lib/factbook/sect.rb
85
86
  - lib/factbook/version.rb
86
87
  - test/data/countrytemplate_au.html
87
88
  - test/data/countrytemplate_be.html
88
89
  - test/data/countrytemplate_br.html
90
+ - test/data/countrytemplate_ee.html
91
+ - test/data/countrytemplate_ls.html
89
92
  - test/data/countrytemplate_mx.html
93
+ - test/data/countrytemplate_vt.html
94
+ - test/data/countrytemplate_xx.html
90
95
  - test/helper.rb
91
96
  - test/test_json.rb
92
97
  - test/test_page.rb