factbook 0.1.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/Manifest.txt +34 -22
  3. data/README.md +8 -3
  4. data/Rakefile +2 -263
  5. data/data/codes.csv +262 -0
  6. data/data/comparisons.csv +75 -0
  7. data/lib/factbook/builder.rb +214 -0
  8. data/lib/factbook/builder_item.rb +93 -0
  9. data/lib/factbook/codes.rb +119 -0
  10. data/lib/factbook/comparisons.rb +50 -0
  11. data/lib/factbook/page.rb +103 -303
  12. data/lib/factbook/sanitizer.rb +214 -0
  13. data/lib/factbook/sect.rb +29 -196
  14. data/lib/factbook/subsect.rb +18 -0
  15. data/lib/factbook/table.rb +52 -0
  16. data/lib/factbook/utils.rb +85 -0
  17. data/lib/factbook/utils_info.rb +102 -0
  18. data/lib/factbook/version.rb +4 -3
  19. data/lib/factbook.rb +23 -1
  20. data/test/data/au.html +579 -0
  21. data/test/data/au.yml +8 -0
  22. data/test/data/be.html +596 -0
  23. data/test/data/be.yml +8 -0
  24. data/test/data/src/au.html +2006 -0
  25. data/test/data/src/be.html +2011 -0
  26. data/test/helper.rb +0 -4
  27. data/test/test_builder.rb +37 -0
  28. data/test/test_codes.rb +76 -0
  29. data/test/test_comparisons.rb +19 -0
  30. data/test/test_fields.rb +21 -18
  31. data/test/test_item_builder.rb +99 -0
  32. data/test/test_json.rb +17 -20
  33. data/test/test_page.rb +18 -10
  34. data/test/test_sanitizer.rb +35 -0
  35. metadata +68 -49
  36. data/.gemtest +0 -0
  37. data/test/data/countrytemplate_au.html +0 -4179
  38. data/test/data/countrytemplate_be.html +0 -4260
  39. data/test/data/countrytemplate_br.html +0 -4366
  40. data/test/data/countrytemplate_ee.html +0 -2999
  41. data/test/data/countrytemplate_ls.html +0 -2728
  42. data/test/data/countrytemplate_mx.html +0 -4397
  43. data/test/data/countrytemplate_vt.html +0 -1726
  44. data/test/data/countrytemplate_xx.html +0 -2898
  45. data/test/test_page_old.rb +0 -478
  46. data/test/test_strip.rb +0 -66
data/lib/factbook/page.rb CHANGED
@@ -1,303 +1,103 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Page
7
- include LogUtils::Logging
8
-
9
- ## standard version
10
- ## SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
11
-
12
- ## -- use text (low-bandwidth) version
13
- ## e.g. www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_br.html
14
- SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
15
-
16
- def initialize( code, opts={} )
17
- ## note: requires factbook country code
18
- # e.g. austria is au
19
- # germany is gm and so on
20
- @code = code
21
-
22
- ### rename fields to format option?? why? why not? e.g. :format => 'long' ??
23
- @opts = opts # fields: full|long|keep|std|?? -- find a good name for the option keeping field names as is
24
-
25
- @html = nil
26
- @doc = nil
27
- @sects = nil
28
- @data = nil
29
- end
30
-
31
- def doc
32
- @doc ||= Nokogiri::HTML( html )
33
- end
34
-
35
- def to_json( opts={} )
36
- ## convenience helper for data.to_json
37
- if opts[:pretty] || opts[:pp]
38
- JSON.pretty_generate( data )
39
- else
40
- data.to_json
41
- end
42
- end
43
-
44
-
45
- def [](key) ### convenience shortcut
46
- # lets you use
47
- # page['geo']
48
- # instead of
49
- # page.data['geo']
50
-
51
- ## fix: use delegate data, [] from forwardable lib - why?? why not??
52
-
53
- data[key]
54
- end
55
-
56
-
57
- def data
58
- if @data.nil?
59
- @data = {}
60
-
61
- if @opts[:header] ## include (leading) header section ??
62
-
63
- header_key = @opts[:fields] ? 'Header' : 'header'
64
- last_built_key = @opts[:fields] ? 'last built' : 'last_built'
65
-
66
- @data[header_key] = {
67
- 'code' => @code,
68
- 'generator' => "factbook/#{VERSION}",
69
- last_built_key => "#{Time.now}",
70
- }
71
- end
72
-
73
- sects.each_with_index do |sect,i|
74
- logger.debug "############################"
75
- logger.debug "### [#{i}] stats sect >#{sect.title}<: "
76
-
77
- @data[ sect.title ] = sect.data
78
- end
79
- end
80
- @data
81
- end
82
-
83
-
84
- def sects
85
- if @sects.nil?
86
- ## split html into sections
87
- ## lets us avoids errors w/ (wrongly) nested tags
88
-
89
- ## check opts for using long or short category/field names
90
- divs = [
91
- [ @opts[:fields] ? 'Introduction' : 'intro', '<div id="CollapsiblePanel1_Intro"' ],
92
- [ @opts[:fields] ? 'Geography' : 'geo', '<div id="CollapsiblePanel1_Geo"' ],
93
- [ @opts[:fields] ? 'People and Society' : 'people', '<div id="CollapsiblePanel1_People"' ],
94
- [ @opts[:fields] ? 'Government' : 'govt', '<div id="CollapsiblePanel1_Govt"' ],
95
- [ @opts[:fields] ? 'Economy' : 'econ', '<div id="CollapsiblePanel1_Econ"' ],
96
- [ @opts[:fields] ? 'Energy' : 'energy', '<div id="CollapsiblePanel1_Energy"' ],
97
- [ @opts[:fields] ? 'Communications' : 'comm', '<div id="CollapsiblePanel1_Comm"' ],
98
- [ @opts[:fields] ? 'Transportation' : 'trans', '<div id="CollapsiblePanel1_Trans"' ],
99
- [ @opts[:fields] ? 'Military' : 'military', '<div id="CollapsiblePanel1_Military"'],
100
- [ @opts[:fields] ? 'Transnational Issues': 'issues', '<div id="CollapsiblePanel1_Issues"' ]
101
- ]
102
-
103
- indexes = []
104
-
105
- ## note:
106
- ## skip missing sections (w/ warning)
107
- ## e.g. Vatican (Holy See), Liechtenstein etc. have no Energy section, for example
108
-
109
- divs.each_with_index do |rec,i|
110
- title = rec[0]
111
- div = rec[1]
112
- p = html.index( div )
113
- if p.nil?
114
- ## issue warning: if not found
115
- logger.warn "***!!! section not found -- #{div} --; skipping"
116
- else
117
- logger.debug " found section #{i} @ #{p}"
118
- indexes << [title,p]
119
- end
120
- end
121
-
122
- @sects = []
123
-
124
- indexes.each_with_index do |rec,i|
125
- title = rec[0]
126
- from = rec[1]
127
-
128
- # is last entry? if yes use -1 otherewise pos
129
- # note: subtract one (-1) from pos unless end-of-string (-1)
130
- to = indexes[i+1].nil? ? -1 : indexes[i+1][1]-1
131
-
132
- ## todo: check that from is smaller than to
133
- logger.debug " cut section #{i} [#{from}..#{to}]"
134
- @sects << Sect.new( title, html[ from..to ], @opts )
135
-
136
- ##if i==0 || i==1
137
- ## puts "debug sect #{i}:"
138
- ## puts ">>>|||#{html[ from..to ]}|||<<<"
139
- ##end
140
- end
141
- end
142
-
143
- @sects
144
- end
145
-
146
- def html=(html)
147
- ## for debugging n testing
148
- ## lets you set html (no need to fetch via net)
149
- @html = html
150
- end
151
-
152
- def html
153
- if @html.nil?
154
- @html = fetch()
155
-
156
- ### remove everything up to
157
- ## <div id="countryInfo" style="display: none;">
158
- ## remove everything starting w/ footer
159
- ## remove head !!!
160
- ## in body remove header n footer
161
-
162
- ## remove inline script
163
- @html = @html.gsub( /<script[^>]*>.*?<\/script>/m ) do |m|
164
- puts "remove script:"
165
- puts "#{m}"
166
- ''
167
- end
168
-
169
- ## remove inline style
170
- @html = @html.gsub( /<style[^>]*>.*?<\/style>/m ) do |m|
171
- puts "remove style:"
172
- puts "#{m}"
173
- ''
174
- end
175
-
176
- ## remove link
177
- link_regex = /<link[^>]+>/
178
- @html = @html.gsub( link_regex ) do |m|
179
- puts "remove link:"
180
- puts "#{m}"
181
- ''
182
- end
183
-
184
- div_country_info_regex = /<div id="countryInfo"\s*>/
185
- ## remove everything before <div id="countryInfo" >
186
- pos = @html.index( div_country_info_regex )
187
- if pos # not nil, false
188
- @html = @html[pos..-1]
189
- end
190
-
191
- ## remove country comparison
192
- ## e.g. <span class="category" >country comparison to the world:</span>
193
- ## <span class="category_data">
194
- ## <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world">
195
- ## 5
196
- ## </a>
197
- ## </span>
198
-
199
- ##
200
- ##
201
- ## <span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data">
202
- ## <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown="" title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>
203
- ##
204
-
205
- country_comparison_regex = /
206
- <span \s class="category"[^>]*>
207
- country \s comparison \s to \s the \s world:
208
- <\/span>
209
- \s*
210
- <span \s class="category_data"[^>]*>
211
- \s*
212
- <a \s [^>]+>
213
- .+?
214
- <\/a>
215
- \s*
216
- <\/span>
217
- /xm
218
-
219
- @html = @html.gsub( country_comparison_regex ) do |m|
220
- puts "remove country comparison:"
221
- puts "#{m}"
222
- ''
223
- end
224
-
225
- style_attr_regex = /\s*style="[^"]+"/
226
- @html = @html.gsub( style_attr_regex ) do |m|
227
- puts "remove style attr:"
228
- puts "#{m}"
229
- ''
230
- end
231
-
232
- ## <tr height="22">
233
- ## <td class="category_data"></td>
234
- ## </tr>
235
- tr_empty_regex = /
236
- <tr[^>]*>
237
- \s*
238
- <td[^>]*> \s* <\/td>
239
- \s*
240
- <\/tr>
241
- /xm
242
- @html = @html.gsub( tr_empty_regex ) do |m|
243
- puts "remove tr emtpy:"
244
- puts "#{m}"
245
- ''
246
- end
247
-
248
- ## remove world leader website promo
249
- ## <span class="category">(For more information visit the
250
- ## <a href="/library/publications/world-leaders-1/index.html" target="_blank">World Leaders website</a>&nbsp;
251
- ## <img src="../graphics/soa_newwindow.gif" alt="Opens in New Window" title="Opens in New Window" border="0"/>)
252
- ## </span>
253
- world_leaders_website_regex = /
254
- <span \s class="category"[^>]*>
255
- \(
256
- For \s more \s information \s
257
- .+? ## non-greedy (smallest possible match
258
- \)
259
- <\/span>
260
- /xm
261
- @html = @html.gsub( world_leaders_website_regex ) do |m|
262
- puts "remove world leader website promo:"
263
- puts "#{m}"
264
- ''
265
- end
266
-
267
- end
268
- @html
269
- end
270
-
271
- private
272
- def fetch
273
- uri_string = SITE_BASE.gsub( '{code}', @code )
274
-
275
- worker = Fetcher::Worker.new
276
- response = worker.get_response( uri_string )
277
-
278
- if response.code == '200'
279
- t = response.body
280
- ###
281
- # NB: Net::HTTP will NOT set encoding UTF-8 etc.
282
- # will mostly be ASCII
283
- # - try to change encoding to UTF-8 ourselves
284
- logger.debug "t.encoding.name (before): #{t.encoding.name}"
285
- #####
286
- # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
287
-
288
- ## NB:
289
- # for now "hardcoded" to utf8 - what else can we do?
290
- # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
291
- t = t.force_encoding( Encoding::UTF_8 )
292
- logger.debug "t.encoding.name (after): #{t.encoding.name}"
293
- ## pp t
294
- t
295
- else
296
- logger.error "fetch HTTP - #{response.code} #{response.message}"
297
- nil
298
- end
299
- end
300
-
301
- end # class Page
302
-
303
- end # module Factbook
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+
6
+ ## note:
7
+ ## some factbook pages with chrome (headers, footers, etc.)
8
+ ## are NOT valid utf-8, thus,
9
+ ## treat page as is (e.g. ASCII8BIT)
10
+ #
11
+ # only convert to utf8 when header and footer got stripped
12
+
13
+ ##
14
+ ## be/benin:
15
+ ## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
16
+ #
17
+ ## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
18
+ # Lazare Sèhouéto
19
+ #
20
+ # looks good - use (assume) Windows-1252 ????
21
+
22
+ ##
23
+ # check if page is 7-bit ascii ??? if yes - no worries
24
+ # if not, log number of chars not using ascii 7-bit
25
+
26
+
27
+
28
+ class Page
29
+ include LogUtils::Logging
30
+
31
+ attr_accessor :sects
32
+
33
+ def initialize
34
+ @sects = []
35
+ end
36
+
37
+ def [](key) ### convenience shortcut
38
+ # lets you use
39
+ # page['geo']
40
+ # instead of
41
+ # page.data['geo']
42
+
43
+ ## fix: use delegate data, [] from forwardable lib - why?? why not??
44
+
45
+ data[key]
46
+ end
47
+
48
+
49
+ def data
50
+ ## note: cache data hash on first build for now
51
+ if @data.nil?
52
+ ## convert sects to hash
53
+ @data = {}
54
+
55
+ sects.each_with_index do |sect,i|
56
+ @data[ sect.title ] = sect.data
57
+ end
58
+ end
59
+ @data
60
+ end
61
+
62
+
63
+ =begin
64
+ def self.from_url( cc, cn )
65
+ html_ascii = PageFetcher.new.fetch( cc )
66
+ self.new( cc, cn, html_ascii )
67
+ end
68
+
69
+ def self.from_file( cc, cn, opts={} )
70
+ input_dir = opts[:input_dir] || '.'
71
+ html_ascii = File.read( "#{input_dir}/#{cc}.html" ) ## fix/todo: use ASCII8BIT/binary reader
72
+ self.new( cc, cn, html_ascii )
73
+ end
74
+ =end
75
+
76
+
77
+ end # class Page
78
+
79
+
80
+ =begin
81
+ class PageFetcher
82
+
83
+ def fetch( cc )
84
+ worker = Fetcher::Worker.new
85
+ factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
86
+
87
+ res = worker.get_response( "#{factbook_base}/#{cc}.html" )
88
+
89
+ # on error throw exception - why? why not??
90
+ if res.code != '200'
91
+ raise Fetcher::HttpError.new( res.code, res.message )
92
+ end
93
+
94
+ ###
95
+ # Note: Net::HTTP will NOT set encoding UTF-8 etc.
96
+ # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
97
+ html = res.body.to_s
98
+ end
99
+ end # PageFetcher
100
+ =end
101
+
102
+
103
+ end # module Factbook
@@ -0,0 +1,214 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ PageInfo = Struct.new( :country_code,
6
+ :country_name,
7
+ :country_affiliation,
8
+ :region_code,
9
+ :region_name,
10
+ :last_updated )
11
+
12
+ class Sanitizer
13
+ include LogUtils::Logging
14
+ include Utils ## pulls in encode_utf8, ...
15
+
16
+
17
+ def sanitize( html_ascii )
18
+ ## todo: add option for (html source) encoding - why?? why not??
19
+
20
+ ## note:
21
+ ## returns 1) html profile without headers, footers, scripts, etc.
22
+ ## 2) page (meta) info e.g. country_name, country_code, last_updated, etc.
23
+ ## 3) errors e.g. list of errors e.g. encoding errors (invalid byte sequence etc.)
24
+
25
+ page_info = PageInfo.new
26
+
27
+ h = find_page_info( html_ascii )
28
+ page_info.country_code = h[:country_code]
29
+ page_info.country_name = h[:country_name]
30
+ page_info.country_affiliation = h[:country_affiliation]
31
+ page_info.region_code = h[:region_code]
32
+ page_info.region_name = h[:region_name]
33
+
34
+ page_info.last_updated = find_page_last_updated( html_ascii )
35
+
36
+
37
+ html_profile_ascii = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
38
+
39
+ ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
40
+ html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
41
+
42
+ html = sanitize_profile( html )
43
+
44
+ [html, page_info, errors]
45
+ end
46
+
47
+
48
+
49
+ BEGIN_FACTS_REGEX = /<ul\s+
50
+ class="expandcollapse">
51
+ /xim ## ignore case; multi-line
52
+
53
+ END_FACTS_REGEX = /<\/li>\s*
54
+ <\/ul>\s*
55
+ <\/tbody>\s*
56
+ <\/table>
57
+ /xim ## ignore case; multi-line
58
+
59
+
60
+ def find_country_profile( html )
61
+ ####
62
+ ## remove header (everything before)
63
+ ## <ul class="expandcollapse">
64
+
65
+ pos = html.index( BEGIN_FACTS_REGEX )
66
+ fail "*** no begin facts marker found for page" if pos.nil?
67
+
68
+ puts " bingo - found BEGIN_FACTS on pos #{pos}"
69
+ html = html[pos..-1]
70
+
71
+ pp html[0..100]
72
+
73
+ ###
74
+ ## remove footer
75
+ ## assume everything after (last list item in unordered list inside a table body)
76
+ ## </li>
77
+ ## </ul>
78
+ ## </tbody></table>
79
+
80
+ pos = html.index( END_FACTS_REGEX )
81
+ fail "*** no end facts marker found for page" if pos.nil?
82
+
83
+ puts " bingo - found END_FACTS on pos #{pos}"
84
+ html = html[0...pos] + "</li></ul>\n" ## note: use ... (not .. to cut-off pos)
85
+
86
+ pp html[-200..-1]
87
+ html
88
+ end
89
+
90
+
91
+
92
+ STYLE_ATTR_REGEX = /\s*
93
+ style=('|").+?\1 ## note: use non-greedy match e.g. .+?
94
+ /xim ## do NOT allow multi-line - why? why not?
95
+
96
+ CLASS_ATTR_REGEX = /\s*
97
+ class=('|")(.+?)\1 ## note: use non-greedy match e.g. .+?
98
+ /xim ## do NOT allow multi-line - why? why not?
99
+
100
+ ##
101
+ ## <div>
102
+ ## <span class='category'>country comparison to the world: </span>
103
+ ## <span class='category_data'>[[191]]</span>
104
+ ## </div>
105
+ ##
106
+ ## <span class='category'>country comparison to the world: </span>
107
+ ## <span class='category_data'><a href='../rankorder/2147rank.html#au'>114</a></span>
108
+
109
+
110
+ ## todo: add enclosing div too!!!
111
+
112
+ COUNTRY_COMPARISON_REGEX = /
113
+ <div>
114
+ <span \s class='category'[^>]*>
115
+ country \s comparison \s to \s the \s world: \s*
116
+ <\/span>
117
+ \s*
118
+ <span \s class='category_data'[^>]*>
119
+ \s*
120
+ <a \s [^>]+>
121
+ .+?
122
+ <\/a>
123
+ \s*
124
+ <\/span>
125
+ <\/div>
126
+ /xim
127
+
128
+ ##
129
+ ## <div class='wrap'>
130
+ ## <div class='audio-player'>
131
+ ## <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
132
+ ## </audio>
133
+ ## </div></div>
134
+
135
+
136
+ AUDIO_PLAYER_REGEX = /
137
+ <div \s class='wrap'>
138
+ <div \s class='audio-player'>
139
+ <audio \s [^>]+>
140
+ <\/audio>
141
+ <\/div>
142
+ <\/div>
143
+ /xim
144
+
145
+ def sanitize_profile( html )
146
+
147
+ html = html.gsub( STYLE_ATTR_REGEX ) do |m|
148
+ puts "remove style attr:"
149
+ puts "#{m}"
150
+ ''
151
+ end
152
+
153
+ html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
154
+ puts "remove audio player:"
155
+ puts "#{m}"
156
+ ''
157
+ end
158
+
159
+
160
+ html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
161
+ puts "remove country comparison:"
162
+ puts "#{m}"
163
+ ''
164
+ end
165
+
166
+ ## remove/cleanup anchors (a href)
167
+ html = html.gsub( /<a\s+href[^>]*>(.+?)<\/a>/im ) do |_| ## note: use .+? non-greedy match
168
+ puts " replace anchor (a) href >#{$1}<"
169
+
170
+ inner_text = $1.dup ## keep a copy
171
+ if inner_text =~ /<img/ ## if includes image remove
172
+ puts " remove image in anchor"
173
+ ''
174
+ else ## keep inner text
175
+ inner_text
176
+ end
177
+ end
178
+
179
+
180
+ ## remove all list e.g. ul/li
181
+ html = html.gsub( /<\/?(li|ul)[^>]*>/im ) do |m|
182
+ puts " remove list >#{m}<"
183
+ ''
184
+ end
185
+
186
+ ## clean-up class attrib e.g. remove unknown classes
187
+ html = html.gsub( CLASS_ATTR_REGEX ) do |m|
188
+ puts "cleanup class attr:"
189
+ puts "#{m}"
190
+
191
+ klasses = $2.split(' ')
192
+ klasses = klasses.select do |klass|
193
+ if ['region', 'category', 'category_data'].include?( klass )
194
+ true
195
+ else
196
+ puts " remove class #{klass}"
197
+ false
198
+ end
199
+ end
200
+
201
+ if klasses.size > 0
202
+ " class='#{klasses.join(' ')}'" ## note: add leading space!!
203
+ else
204
+ '' ## remove class attrib completely
205
+ end
206
+ end
207
+
208
+ html
209
+ end
210
+
211
+
212
+ end # class Sanitizer
213
+
214
+ end # module Factbook