factbook 0.1.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/Manifest.txt +34 -22
  3. data/README.md +8 -3
  4. data/Rakefile +2 -263
  5. data/data/codes.csv +262 -0
  6. data/data/comparisons.csv +75 -0
  7. data/lib/factbook/builder.rb +214 -0
  8. data/lib/factbook/builder_item.rb +93 -0
  9. data/lib/factbook/codes.rb +119 -0
  10. data/lib/factbook/comparisons.rb +50 -0
  11. data/lib/factbook/page.rb +103 -303
  12. data/lib/factbook/sanitizer.rb +214 -0
  13. data/lib/factbook/sect.rb +29 -196
  14. data/lib/factbook/subsect.rb +18 -0
  15. data/lib/factbook/table.rb +52 -0
  16. data/lib/factbook/utils.rb +85 -0
  17. data/lib/factbook/utils_info.rb +102 -0
  18. data/lib/factbook/version.rb +4 -3
  19. data/lib/factbook.rb +23 -1
  20. data/test/data/au.html +579 -0
  21. data/test/data/au.yml +8 -0
  22. data/test/data/be.html +596 -0
  23. data/test/data/be.yml +8 -0
  24. data/test/data/src/au.html +2006 -0
  25. data/test/data/src/be.html +2011 -0
  26. data/test/helper.rb +0 -4
  27. data/test/test_builder.rb +37 -0
  28. data/test/test_codes.rb +76 -0
  29. data/test/test_comparisons.rb +19 -0
  30. data/test/test_fields.rb +21 -18
  31. data/test/test_item_builder.rb +99 -0
  32. data/test/test_json.rb +17 -20
  33. data/test/test_page.rb +18 -10
  34. data/test/test_sanitizer.rb +35 -0
  35. metadata +68 -49
  36. data/.gemtest +0 -0
  37. data/test/data/countrytemplate_au.html +0 -4179
  38. data/test/data/countrytemplate_be.html +0 -4260
  39. data/test/data/countrytemplate_br.html +0 -4366
  40. data/test/data/countrytemplate_ee.html +0 -2999
  41. data/test/data/countrytemplate_ls.html +0 -2728
  42. data/test/data/countrytemplate_mx.html +0 -4397
  43. data/test/data/countrytemplate_vt.html +0 -1726
  44. data/test/data/countrytemplate_xx.html +0 -2898
  45. data/test/test_page_old.rb +0 -478
  46. data/test/test_strip.rb +0 -66
data/lib/factbook/page.rb CHANGED
@@ -1,303 +1,103 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Page
7
- include LogUtils::Logging
8
-
9
- ## standard version
10
- ## SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
11
-
12
- ## -- use text (low-bandwidth) version
13
- ## e.g. www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_br.html
14
- SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
15
-
16
- def initialize( code, opts={} )
17
- ## note: requires factbook country code
18
- # e.g. austria is au
19
- # germany is gm and so on
20
- @code = code
21
-
22
- ### rename fields to format option?? why? why not? e.g. :format => 'long' ??
23
- @opts = opts # fields: full|long|keep|std|?? -- find a good name for the option keeping field names as is
24
-
25
- @html = nil
26
- @doc = nil
27
- @sects = nil
28
- @data = nil
29
- end
30
-
31
- def doc
32
- @doc ||= Nokogiri::HTML( html )
33
- end
34
-
35
- def to_json( opts={} )
36
- ## convenience helper for data.to_json
37
- if opts[:pretty] || opts[:pp]
38
- JSON.pretty_generate( data )
39
- else
40
- data.to_json
41
- end
42
- end
43
-
44
-
45
- def [](key) ### convenience shortcut
46
- # lets you use
47
- # page['geo']
48
- # instead of
49
- # page.data['geo']
50
-
51
- ## fix: use delegate data, [] from forwardable lib - why?? why not??
52
-
53
- data[key]
54
- end
55
-
56
-
57
- def data
58
- if @data.nil?
59
- @data = {}
60
-
61
- if @opts[:header] ## include (leading) header section ??
62
-
63
- header_key = @opts[:fields] ? 'Header' : 'header'
64
- last_built_key = @opts[:fields] ? 'last built' : 'last_built'
65
-
66
- @data[header_key] = {
67
- 'code' => @code,
68
- 'generator' => "factbook/#{VERSION}",
69
- last_built_key => "#{Time.now}",
70
- }
71
- end
72
-
73
- sects.each_with_index do |sect,i|
74
- logger.debug "############################"
75
- logger.debug "### [#{i}] stats sect >#{sect.title}<: "
76
-
77
- @data[ sect.title ] = sect.data
78
- end
79
- end
80
- @data
81
- end
82
-
83
-
84
- def sects
85
- if @sects.nil?
86
- ## split html into sections
87
- ## lets us avoids errors w/ (wrongly) nested tags
88
-
89
- ## check opts for using long or short category/field names
90
- divs = [
91
- [ @opts[:fields] ? 'Introduction' : 'intro', '<div id="CollapsiblePanel1_Intro"' ],
92
- [ @opts[:fields] ? 'Geography' : 'geo', '<div id="CollapsiblePanel1_Geo"' ],
93
- [ @opts[:fields] ? 'People and Society' : 'people', '<div id="CollapsiblePanel1_People"' ],
94
- [ @opts[:fields] ? 'Government' : 'govt', '<div id="CollapsiblePanel1_Govt"' ],
95
- [ @opts[:fields] ? 'Economy' : 'econ', '<div id="CollapsiblePanel1_Econ"' ],
96
- [ @opts[:fields] ? 'Energy' : 'energy', '<div id="CollapsiblePanel1_Energy"' ],
97
- [ @opts[:fields] ? 'Communications' : 'comm', '<div id="CollapsiblePanel1_Comm"' ],
98
- [ @opts[:fields] ? 'Transportation' : 'trans', '<div id="CollapsiblePanel1_Trans"' ],
99
- [ @opts[:fields] ? 'Military' : 'military', '<div id="CollapsiblePanel1_Military"'],
100
- [ @opts[:fields] ? 'Transnational Issues': 'issues', '<div id="CollapsiblePanel1_Issues"' ]
101
- ]
102
-
103
- indexes = []
104
-
105
- ## note:
106
- ## skip missing sections (w/ warning)
107
- ## e.g. Vatican (Holy See), Liechtenstein etc. have no Energy section, for example
108
-
109
- divs.each_with_index do |rec,i|
110
- title = rec[0]
111
- div = rec[1]
112
- p = html.index( div )
113
- if p.nil?
114
- ## issue warning: if not found
115
- logger.warn "***!!! section not found -- #{div} --; skipping"
116
- else
117
- logger.debug " found section #{i} @ #{p}"
118
- indexes << [title,p]
119
- end
120
- end
121
-
122
- @sects = []
123
-
124
- indexes.each_with_index do |rec,i|
125
- title = rec[0]
126
- from = rec[1]
127
-
128
- # is last entry? if yes use -1 otherewise pos
129
- # note: subtract one (-1) from pos unless end-of-string (-1)
130
- to = indexes[i+1].nil? ? -1 : indexes[i+1][1]-1
131
-
132
- ## todo: check that from is smaller than to
133
- logger.debug " cut section #{i} [#{from}..#{to}]"
134
- @sects << Sect.new( title, html[ from..to ], @opts )
135
-
136
- ##if i==0 || i==1
137
- ## puts "debug sect #{i}:"
138
- ## puts ">>>|||#{html[ from..to ]}|||<<<"
139
- ##end
140
- end
141
- end
142
-
143
- @sects
144
- end
145
-
146
- def html=(html)
147
- ## for debugging n testing
148
- ## lets you set html (no need to fetch via net)
149
- @html = html
150
- end
151
-
152
- def html
153
- if @html.nil?
154
- @html = fetch()
155
-
156
- ### remove everything up to
157
- ## <div id="countryInfo" style="display: none;">
158
- ## remove everything starting w/ footer
159
- ## remove head !!!
160
- ## in body remove header n footer
161
-
162
- ## remove inline script
163
- @html = @html.gsub( /<script[^>]*>.*?<\/script>/m ) do |m|
164
- puts "remove script:"
165
- puts "#{m}"
166
- ''
167
- end
168
-
169
- ## remove inline style
170
- @html = @html.gsub( /<style[^>]*>.*?<\/style>/m ) do |m|
171
- puts "remove style:"
172
- puts "#{m}"
173
- ''
174
- end
175
-
176
- ## remove link
177
- link_regex = /<link[^>]+>/
178
- @html = @html.gsub( link_regex ) do |m|
179
- puts "remove link:"
180
- puts "#{m}"
181
- ''
182
- end
183
-
184
- div_country_info_regex = /<div id="countryInfo"\s*>/
185
- ## remove everything before <div id="countryInfo" >
186
- pos = @html.index( div_country_info_regex )
187
- if pos # not nil, false
188
- @html = @html[pos..-1]
189
- end
190
-
191
- ## remove country comparison
192
- ## e.g. <span class="category" >country comparison to the world:</span>
193
- ## <span class="category_data">
194
- ## <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world">
195
- ## 5
196
- ## </a>
197
- ## </span>
198
-
199
- ##
200
- ##
201
- ## <span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data">
202
- ## <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>
203
- ##
204
-
205
- country_comparison_regex = /
206
- <span \s class="category"[^>]*>
207
- country \s comparison \s to \s the \s world:
208
- <\/span>
209
- \s*
210
- <span \s class="category_data"[^>]*>
211
- \s*
212
- <a \s [^>]+>
213
- .+?
214
- <\/a>
215
- \s*
216
- <\/span>
217
- /xm
218
-
219
- @html = @html.gsub( country_comparison_regex ) do |m|
220
- puts "remove country comparison:"
221
- puts "#{m}"
222
- ''
223
- end
224
-
225
- style_attr_regex = /\s*style="[^"]+"/
226
- @html = @html.gsub( style_attr_regex ) do |m|
227
- puts "remove style attr:"
228
- puts "#{m}"
229
- ''
230
- end
231
-
232
- ## <tr height="22">
233
- ## <td class="category_data"></td>
234
- ## </tr>
235
- tr_empty_regex = /
236
- <tr[^>]*>
237
- \s*
238
- <td[^>]*> \s* <\/td>
239
- \s*
240
- <\/tr>
241
- /xm
242
- @html = @html.gsub( tr_empty_regex ) do |m|
243
- puts "remove tr emtpy:"
244
- puts "#{m}"
245
- ''
246
- end
247
-
248
- ## remove world leader website promo
249
- ## <span class="category">(For more information visit the
250
- ## <a href="/library/publications/world-leaders-1/index.html" target="_blank">World Leaders website</a>&nbsp;
251
- ## <img src="../graphics/soa_newwindow.gif" alt="Opens in New Window" title="Opens in New Window" border="0"/>)
252
- ## </span>
253
- world_leaders_website_regex = /
254
- <span \s class="category"[^>]*>
255
- \(
256
- For \s more \s information \s
257
- .+? ## non-greedy (smallest possible match
258
- \)
259
- <\/span>
260
- /xm
261
- @html = @html.gsub( world_leaders_website_regex ) do |m|
262
- puts "remove world leader website promo:"
263
- puts "#{m}"
264
- ''
265
- end
266
-
267
- end
268
- @html
269
- end
270
-
271
- private
272
- def fetch
273
- uri_string = SITE_BASE.gsub( '{code}', @code )
274
-
275
- worker = Fetcher::Worker.new
276
- response = worker.get_response( uri_string )
277
-
278
- if response.code == '200'
279
- t = response.body
280
- ###
281
- # NB: Net::HTTP will NOT set encoding UTF-8 etc.
282
- # will mostly be ASCII
283
- # - try to change encoding to UTF-8 ourselves
284
- logger.debug "t.encoding.name (before): #{t.encoding.name}"
285
- #####
286
- # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
287
-
288
- ## NB:
289
- # for now "hardcoded" to utf8 - what else can we do?
290
- # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
291
- t = t.force_encoding( Encoding::UTF_8 )
292
- logger.debug "t.encoding.name (after): #{t.encoding.name}"
293
- ## pp t
294
- t
295
- else
296
- logger.error "fetch HTTP - #{response.code} #{response.message}"
297
- nil
298
- end
299
- end
300
-
301
- end # class Page
302
-
303
- end # module Factbook
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+
6
+ ## note:
7
+ ## some factbook pages with chrome (headers, footers, etc.)
8
+ ## are NOT valid utf-8, thus,
9
+ ## treat page as is (e.g. ASCII8BIT)
10
+ #
11
+ # only convert to utf8 when header and footer got stripped
12
+
13
+ ##
14
+ ## be/benin:
15
+ ## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
16
+ #
17
+ ## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
18
+ # Lazare Sèhouéto
19
+ #
20
+ # looks good - use (assume) Windows-1252 ????
21
+
22
+ ##
23
+ # check if page is ascii 7-bit ??? if yes - no worries
24
+ # if not, log number of chars not using ascii 7-bit
25
+
26
+
27
+
28
+ class Page
29
+ include LogUtils::Logging
30
+
31
+ attr_accessor :sects
32
+
33
+ def initialize
34
+ @sects = []
35
+ end
36
+
37
+ def [](key) ### convenience shortcut
38
+ # lets you use
39
+ # page['geo']
40
+ # instead of
41
+ # page.data['geo']
42
+
43
+ ## fix: use delegate data, [] from forwardable lib - why?? why not??
44
+
45
+ data[key]
46
+ end
47
+
48
+
49
+ def data
50
+ ## note: cache data hash on first build for now
51
+ if @data.nil?
52
+ ## convert sects to hash
53
+ @data = {}
54
+
55
+ sects.each_with_index do |sect,i|
56
+ @data[ sect.title ] = sect.data
57
+ end
58
+ end
59
+ @data
60
+ end
61
+
62
+
63
+ =begin
64
+ def self.from_url( cc, cn )
65
+ html_ascii = PageFetcher.new.fetch( cc )
66
+ self.new( cc, cn, html_ascii )
67
+ end
68
+
69
+ def self.from_file( cc, cn, opts={} )
70
+ input_dir = opts[:input_dir] || '.'
71
+ html_ascii = File.read( "#{input_dir}/#{cc}.html" ) ## fix/todo: use ASCII8BIT/binary reader
72
+ self.new( cc, cn, html_ascii )
73
+ end
74
+ =end
75
+
76
+
77
+ end # class Page
78
+
79
+
80
+ =begin
81
+ class PageFetcher
82
+
83
+ def fetch( cc )
84
+ worker = Fetcher::Worker.new
85
+ factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
86
+
87
+ res = worker.get_response( "#{factbook_base}/#{cc}.html" )
88
+
89
+ # on error throw exception - why? why not??
90
+ if res.code != '200'
91
+ raise Fetcher::HttpError.new( res.code, res.message )
92
+ end
93
+
94
+ ###
95
+ # Note: Net::HTTP will NOT set encoding UTF-8 etc.
96
+ # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
97
+ html = res.body.to_s
98
+ end
99
+ end # PageFetcher
100
+ =end
101
+
102
+
103
+ end # module Factbook
@@ -0,0 +1,214 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ PageInfo = Struct.new( :country_code,
6
+ :country_name,
7
+ :country_affiliation,
8
+ :region_code,
9
+ :region_name,
10
+ :last_updated )
11
+
12
+ class Sanitizer
13
+ include LogUtils::Logging
14
+ include Utils ## pulls in encode_utf8, ...
15
+
16
+
17
+ def sanitize( html_ascii )
18
+ ## todo: add option for (html source) encoding - why?? why not??
19
+
20
+ ## note:
21
+ ## returns 1) html profile without headers, footers, scripts, etc.
22
+ ## 2) page (meta) info e.g. country_name, country_code, last_updated, etc.
23
+ ## 3) errors e.g. list of errors such as encoding errors (invalid byte sequence etc.)
24
+
25
+ page_info = PageInfo.new
26
+
27
+ h = find_page_info( html_ascii )
28
+ page_info.country_code = h[:country_code]
29
+ page_info.country_name = h[:country_name]
30
+ page_info.country_affiliation = h[:country_affiliation]
31
+ page_info.region_code = h[:region_code]
32
+ page_info.region_name = h[:region_name]
33
+
34
+ page_info.last_updated = find_page_last_updated( html_ascii )
35
+
36
+
37
+ html_profile_ascii = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
38
+
39
+ ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
40
+ html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
41
+
42
+ html = sanitize_profile( html )
43
+
44
+ [html, page_info, errors]
45
+ end
46
+
47
+
48
+
49
+ BEGIN_FACTS_REGEX = /<ul\s+
50
+ class="expandcollapse">
51
+ /xim ## ignore case; multi-line
52
+
53
+ END_FACTS_REGEX = /<\/li>\s*
54
+ <\/ul>\s*
55
+ <\/tbody>\s*
56
+ <\/table>
57
+ /xim ## ignore case; multi-line
58
+
59
+
60
+ def find_country_profile( html )
61
+ ####
62
+ ## remove header (everything before)
63
+ ## <ul class="expandcollapse">
64
+
65
+ pos = html.index( BEGIN_FACTS_REGEX )
66
+ fail "*** no begin facts marker found for page" if pos.nil?
67
+
68
+ puts " bingo - found BEGIN_FACTS on pos #{pos}"
69
+ html = html[pos..-1]
70
+
71
+ pp html[0..100]
72
+
73
+ ###
74
+ ## remove footer
75
+ ## assume everything after (last list item in unordered list inside a table body)
76
+ ## </li>
77
+ ## </ul>
78
+ ## </tbody></table>
79
+
80
+ pos = html.index( END_FACTS_REGEX )
81
+ fail "*** no end facts marker found for page" if pos.nil?
82
+
83
+ puts " bingo - found END_FACTS on pos #{pos}"
84
+ html = html[0...pos] + "</li></ul>\n" ## note: use ... (not .. to cut-off pos)
85
+
86
+ pp html[-200..-1]
87
+ html
88
+ end
89
+
90
+
91
+
92
+ STYLE_ATTR_REGEX = /\s*
93
+ style=('|").+?\1 ## note: use non-greedy match e.g. .+?
94
+ /xim ## note: m (multi-line) flag is set, so . matches newlines too; todo: do NOT allow multi-line - why? why not?
95
+
96
+ CLASS_ATTR_REGEX = /\s*
97
+ class=('|")(.+?)\1 ## note: use non-greedy match e.g. .+?
98
+ /xim ## note: m (multi-line) flag is set, so . matches newlines too; todo: do NOT allow multi-line - why? why not?
99
+
100
+ ##
101
+ ## <div>
102
+ ## <span class='category'>country comparison to the world: </span>
103
+ ## <span class='category_data'>[[191]]</span>
104
+ ## </div>
105
+ ##
106
+ ## <span class='category'>country comparison to the world: </span>
107
+ ## <span class='category_data'><a href='../rankorder/2147rank.html#au'>114</a></span>
108
+
109
+
110
+ ## todo: add enclosing div too!!!
111
+
112
+ COUNTRY_COMPARISON_REGEX = /
113
+ <div>
114
+ <span \s class='category'[^>]*>
115
+ country \s comparison \s to \s the \s world: \s*
116
+ <\/span>
117
+ \s*
118
+ <span \s class='category_data'[^>]*>
119
+ \s*
120
+ <a \s [^>]+>
121
+ .+?
122
+ <\/a>
123
+ \s*
124
+ <\/span>
125
+ <\/div>
126
+ /xim
127
+
128
+ ##
129
+ ## <div class='wrap'>
130
+ ## <div class='audio-player'>
131
+ ## <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
132
+ ## </audio>
133
+ ## </div></div>
134
+
135
+
136
+ AUDIO_PLAYER_REGEX = /
137
+ <div \s class='wrap'>
138
+ <div \s class='audio-player'>
139
+ <audio \s [^>]+>
140
+ <\/audio>
141
+ <\/div>
142
+ <\/div>
143
+ /xim
144
+
145
+ def sanitize_profile( html )
146
+
147
+ html = html.gsub( STYLE_ATTR_REGEX ) do |m|
148
+ puts "remove style attr:"
149
+ puts "#{m}"
150
+ ''
151
+ end
152
+
153
+ html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
154
+ puts "remove audio player:"
155
+ puts "#{m}"
156
+ ''
157
+ end
158
+
159
+
160
+ html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
161
+ puts "remove country comparison:"
162
+ puts "#{m}"
163
+ ''
164
+ end
165
+
166
+ ## remove/cleanup anchors (a href)
167
+ html = html.gsub( /<a\s+href[^>]*>(.+?)<\/a>/im ) do |_| ## note: use .+? non-greedy match
168
+ puts " replace anchor (a) href >#{$1}<"
169
+
170
+ inner_text = $1.dup ## keep a copy
171
+ if inner_text =~ /<img/ ## if includes image remove
172
+ puts " remove image in anchor"
173
+ ''
174
+ else ## keep inner text
175
+ inner_text
176
+ end
177
+ end
178
+
179
+
180
+ ## remove all list e.g. ul/li
181
+ html = html.gsub( /<\/?(li|ul)[^>]*>/im ) do |m|
182
+ puts " remove list >#{m}<"
183
+ ''
184
+ end
185
+
186
+ ## clean-up class attrib e.g. remove unknown classes
187
+ html = html.gsub( CLASS_ATTR_REGEX ) do |m|
188
+ puts "cleanup class attr:"
189
+ puts "#{m}"
190
+
191
+ klasses = $2.split(' ')
192
+ klasses = klasses.select do |klass|
193
+ if ['region', 'category', 'category_data'].include?( klass )
194
+ true
195
+ else
196
+ puts " remove class #{klass}"
197
+ false
198
+ end
199
+ end
200
+
201
+ if klasses.size > 0
202
+ " class='#{klasses.join(' ')}'" ## note: add leading space!!
203
+ else
204
+ '' ## remove class attrib completely
205
+ end
206
+ end
207
+
208
+ html
209
+ end
210
+
211
+
212
+ end # class Sanitizer
213
+
214
+ end # module Factbook