factbook 1.1.1 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +4 -4
  2. data/{HISTORY.md → CHANGELOG.md} +3 -3
  3. data/Manifest.txt +1 -58
  4. data/README.md +50 -575
  5. data/Rakefile +29 -33
  6. data/lib/factbook.rb +8 -75
  7. metadata +20 -114
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -214
  16. data/lib/factbook/builder_item.rb +0 -92
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -185
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -207
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -102
  34. data/lib/factbook/version.rb +0 -22
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -18
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/au.html +0 -2006
  48. data/test/data/src/be.html +0 -2011
  49. data/test/helper.rb +0 -11
  50. data/test/test_attribs.rb +0 -82
  51. data/test/test_attribs_def.rb +0 -20
  52. data/test/test_builder.rb +0 -35
  53. data/test/test_codes.rb +0 -76
  54. data/test/test_comparisons.rb +0 -19
  55. data/test/test_convert.rb +0 -30
  56. data/test/test_counter.rb +0 -31
  57. data/test/test_fields.rb +0 -52
  58. data/test/test_importer.rb +0 -55
  59. data/test/test_item_builder.rb +0 -99
  60. data/test/test_json.rb +0 -44
  61. data/test/test_json_builder.rb +0 -25
  62. data/test/test_normalize.rb +0 -23
  63. data/test/test_page.rb +0 -38
  64. data/test/test_sanitizer.rb +0 -35
@@ -1,185 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- ## note:
7
- ## some factbook pages with chrome (headers, footers, etc.)
8
- ## are NOT valid utf-8, thus,
9
- ## treat page as is (e.g. ASCII8BIT)
10
- #
11
- # only convert to utf8 when header and footer got stripped
12
-
13
- ##
14
- ## be/benin:
15
- ## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
16
- #
17
- ## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
18
- # Lazare Sèhouéto
19
- #
20
- # looks good - use (assume) Windows-1252 ????
21
-
22
- ##
23
- # check if page is ascii 7-bit ??? if yes - no worries
24
- # if not, log number of chars not using ascii 7-bit
25
-
26
-
27
-
28
- class Page
29
- include LogUtils::Logging
30
-
31
- attr_reader :sects ## "structured" access e.g. sects/subsects/etc.
32
- attr_reader :info ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
33
- attr_reader :data ## "plain" access with vanilla hash
34
-
35
-
36
- ## standard version (note: requires https)
37
- SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
38
-
39
- def initialize( code, opts={} )
40
- ### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
41
-
42
- if opts[:json]
43
- json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
44
- b = JsonBuilder.from_string( json )
45
- else ## assume html
46
- if opts[:html] ## note: expects ASCII-8BIT/BINARY encoding
47
- ## for debugging and testing allow "custom" passed-in html page
48
- html = opts[:html]
49
- else
50
- url_string = SITE_BASE.gsub( '{code}', code )
51
- ## note: expects ASCII-8BIT/BINARY encoding
52
- html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
53
- end
54
- b = Builder.from_string( html )
55
- end
56
-
57
- @sects = b.sects
58
- @info = b.info
59
-
60
- ## todo/fix/quick hack:
61
- ## check for info opts hash entry - lets you overwrite page info
62
- ## -- use proper header to setup page info - why, why not??
63
- if opts[:info]
64
- info = opts[:info]
65
- @info = info
66
- end
67
-
68
- @data = {}
69
- @sects.each do |sect|
70
- @data[ sect.title ] = sect.data
71
- end
72
-
73
- self ## return self (check - not needed??)
74
- end
75
-
76
-
77
- def to_json( opts={} ) ## convenience helper for data.to_json; note: pretty print by default!
78
- if opts[:minify]
79
- data.to_json
80
- else
81
- ## was: -- opts[:pretty] || opts[:pp]
82
- JSON.pretty_generate( data ) ## note: pretty print by default!
83
- end
84
- end
85
-
86
-
87
- def [](key) ### convenience shortcut
88
- # lets you use
89
- # page['geo']
90
- # instead of
91
- # page.data['geo']
92
-
93
- ## fix: use delegate data, [] from forwardable lib - why?? why not??
94
-
95
- data[key]
96
- end
97
-
98
- ## add convenience (shortcut) accessors / attributes / fields / getters
99
-
100
- ATTRIBUTES.each do |attrib|
101
- ## e.g.
102
- ## def background() data['Introduction']['Background']['text']; end
103
- ## def location() data['Geography']['Location']['text']; end
104
- ## etc.
105
- if attrib.path.size == 1
106
- define_method attrib.name.to_sym do
107
- @data.fetch( attrib.category, {} ).
108
- fetch( attrib.path[0], {} )['text']
109
- end
110
- else ## assume size 2 for now
111
- define_method attrib.name.to_sym do
112
- @data.fetch( attrib.category, {} ).
113
- fetch( attrib.path[0], {} ).
114
- fetch( attrib.path[1], {} )['text']
115
- end
116
- end
117
- end
118
-
119
-
120
- private
121
- def fetch_page( url_string )
122
-
123
- worker = Fetcher::Worker.new
124
- response = worker.get_response( url_string )
125
-
126
- if response.code == '200'
127
- t = response.body
128
- ###
129
- # NB: Net::HTTP will NOT set encoding UTF-8 etc.
130
- # will mostly be ASCII
131
- # - try to change encoding to UTF-8 ourselves
132
- logger.debug "t.encoding.name (before): #{t.encoding.name}"
133
- #####
134
- # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
135
- t
136
- else
137
- logger.error "fetch HTTP - #{response.code} #{response.message}"
138
- ## todo/fix: raise http exception (see fetcher) -- why? why not??
139
- fail "fetch HTTP - #{response.code} #{response.message}"
140
- nil
141
- end
142
- end
143
-
144
-
145
- =begin
146
- def self.from_url( cc, cn )
147
- html_ascii = PageFetcher.new.fetch( cc )
148
- self.new( cc, cn, html_ascii )
149
- end
150
-
151
- def self.from_file( cc, cn, opts={} )
152
- input_dir = opts[:input_dir] || '.'
153
- html_ascii = File.read( "#{input_dir}/#{cc}.html" ) ## fix/todo: use ASCII8BIT/binary reader
154
- self.new( cc, cn, html_ascii )
155
- end
156
- =end
157
-
158
-
159
- end # class Page
160
-
161
-
162
- =begin
163
- class PageFetcher
164
-
165
- def fetch( cc )
166
- worker = Fetcher::Worker.new
167
- factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
168
-
169
- res = worker.get_response( "#{factbook_base}/#{cc}.html" )
170
-
171
- # on error throw exception - why? why not??
172
- if res.code != '200'
173
- raise Fetcher::HttpError.new( res.code, res.message )
174
- end
175
-
176
- ###
177
- # Note: Net::HTTP will NOT set encoding UTF-8 etc.
178
- # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
179
- html = res.body.to_s
180
- end
181
- end # PageFetcher
182
- =end
183
-
184
-
185
- end # module Factbook
@@ -1,12 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- PageInfo = Struct.new( :country_code,
6
- :country_name,
7
- :country_affiliation,
8
- :region_code,
9
- :region_name,
10
- :last_updated )
11
-
12
- end # module Factbook
@@ -1,51 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class JsonPageReader
7
- def initialize( json_dir )
8
- @json_dir = json_dir
9
- end
10
-
11
- def read_page( code )
12
- path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"
13
-
14
- puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
15
- json = File.read( path )
16
-
17
- ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
18
- # add some page info from code struct
19
-
20
- info = PageInfo.new
21
- info.country_code = code.code
22
- info.country_name = code.name
23
- info.region_name = code.region
24
-
25
- page = Page.new( code.code, json: json, info: info )
26
- page
27
- end
28
-
29
- def read_pages( codes, limit: nil )
30
- pages = []
31
- i=0
32
- codes.each do |code|
33
- next if limit && i > limit ## for debugging just process first x entries
34
-
35
- pages << read_page( code )
36
- end
37
- pages
38
- end
39
-
40
- private
41
- def region_to_slug( text )
42
- ## change and => n
43
- ## change & => n
44
- ## change all spaces to => -
45
- ## e.g. East & Southeast Asia => east-n-southeast-asia
46
- ## Central America and Caribbean => central-america-n-caribbean
47
- text.downcase.gsub('and', 'n').gsub( '&', 'n' ).gsub( ' ', '-' )
48
- end
49
- end ## JsonPageReader
50
-
51
- end # module Factbook
@@ -1,207 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class Sanitizer
6
- include LogUtils::Logging
7
- include Utils ## pulls in encode_utf8, ...
8
-
9
-
10
- def sanitize( html_ascii )
11
- ## todo: add option for (html source) encoding - why?? why not??
12
-
13
- ## note:
14
- ## returns 1) html profile without headers, footers, scripts, etc.
15
- ## 2) page (meta) info e.g. country_name, country_code, last_updated, etc.
16
- ## 3) errors - list of errors e.g. encoding errors (invalid byte sequence etc.)
17
-
18
- page_info = PageInfo.new
19
-
20
- h = find_page_info( html_ascii )
21
- page_info.country_code = h[:country_code]
22
- page_info.country_name = h[:country_name]
23
- page_info.country_affiliation = h[:country_affiliation]
24
- page_info.region_code = h[:region_code]
25
- page_info.region_name = h[:region_name]
26
-
27
- page_info.last_updated = find_page_last_updated( html_ascii )
28
-
29
-
30
- html_profile_ascii = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
31
-
32
- ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
33
- html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
34
-
35
- html = sanitize_profile( html )
36
-
37
- [html, page_info, errors]
38
- end
39
-
40
-
41
-
42
- BEGIN_FACTS_REGEX = /<ul\s+
43
- class="expandcollapse">
44
- /xim ## ignore case; multi-line
45
-
46
- END_FACTS_REGEX = /<\/li>\s*
47
- <\/ul>\s*
48
- <\/tbody>\s*
49
- <\/table>
50
- /xim ## ignore case; multi-line
51
-
52
-
53
- def find_country_profile( html )
54
- ####
55
- ## remove header (everything before)
56
- ## <ul class="expandcollapse">
57
-
58
- pos = html.index( BEGIN_FACTS_REGEX )
59
- fail "*** no begin facts marker found for page" if pos.nil?
60
-
61
- puts " bingo - found BEGIN_FACTS on pos #{pos}"
62
- html = html[pos..-1]
63
-
64
- pp html[0..100]
65
-
66
- ###
67
- ## remove footer
68
- ## assume everything after (last list item in unordered list inside a table body)
69
- ## </li>
70
- ## </ul>
71
- ## </tbody></table>
72
-
73
- pos = html.index( END_FACTS_REGEX )
74
- fail "*** no end facts marker found for page" if pos.nil?
75
-
76
- puts " bingo - found END_FACTS on pos #{pos}"
77
- html = html[0...pos] + "</li></ul>\n" ## note: use ... (not .. to cut-off pos)
78
-
79
- pp html[-200..-1]
80
- html
81
- end
82
-
83
-
84
-
85
- STYLE_ATTR_REGEX = /\s*
86
- style=('|").+?\1 ## note: use non-greedy match e.g. .+?
87
- /xim ## do NOT allow multi-line - why? why not?
88
-
89
- CLASS_ATTR_REGEX = /\s*
90
- class=('|")(.+?)\1 ## note: use non-greedy match e.g. .+?
91
- /xim ## do NOT allow multi-line - why? why not?
92
-
93
- ##
94
- ## <div>
95
- ## <span class='category'>country comparison to the world: </span>
96
- ## <span class='category_data'>[[191]]</span>
97
- ## </div>
98
- ##
99
- ## <span class='category'>country comparison to the world: </span>
100
- ## <span class='category_data'><a href='../rankorder/2147rank.html#au'>114</a></span>
101
-
102
-
103
- ## todo: add enclosing div too!!!
104
-
105
- COUNTRY_COMPARISON_REGEX = /
106
- <div>
107
- <span \s class='category'[^>]*>
108
- country \s comparison \s to \s the \s world: \s*
109
- <\/span>
110
- \s*
111
- <span \s class='category_data'[^>]*>
112
- \s*
113
- <a \s [^>]+>
114
- .+?
115
- <\/a>
116
- \s*
117
- <\/span>
118
- <\/div>
119
- /xim
120
-
121
- ##
122
- ## <div class='wrap'>
123
- ## <div class='audio-player'>
124
- ## <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
125
- ## </audio>
126
- ## </div></div>
127
-
128
-
129
- AUDIO_PLAYER_REGEX = /
130
- <div \s class='wrap'>
131
- <div \s class='audio-player'>
132
- <audio \s [^>]+>
133
- <\/audio>
134
- <\/div>
135
- <\/div>
136
- /xim
137
-
138
- def sanitize_profile( html )
139
-
140
- html = html.gsub( STYLE_ATTR_REGEX ) do |m|
141
- puts "remove style attr:"
142
- puts "#{m}"
143
- ''
144
- end
145
-
146
- html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
147
- puts "remove audio player:"
148
- puts "#{m}"
149
- ''
150
- end
151
-
152
-
153
- html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
154
- puts "remove country comparison:"
155
- puts "#{m}"
156
- ''
157
- end
158
-
159
- ## remove/cleanup anchors (a href)
160
- html = html.gsub( /<a\s+href[^>]*>(.+?)<\/a>/im ) do |_| ## note: use .+? non-greedy match
161
- puts " replace anchor (a) href >#{$1}<"
162
-
163
- inner_text = $1.dup ## keep a copy
164
- if inner_text =~ /<img/ ## if includes image remove
165
- puts " remove image in anchor"
166
- ''
167
- else ## keep inner text
168
- inner_text
169
- end
170
- end
171
-
172
-
173
- ## remove all list e.g. ul/li
174
- html = html.gsub( /<\/?(li|ul)[^>]*>/im ) do |m|
175
- puts " remove list >#{m}<"
176
- ''
177
- end
178
-
179
- ## clean-up class attrib e.g. remove unknown classes
180
- html = html.gsub( CLASS_ATTR_REGEX ) do |m|
181
- puts "cleanup class attr:"
182
- puts "#{m}"
183
-
184
- klasses = $2.split(' ')
185
- klasses = klasses.select do |klass|
186
- if ['region', 'category', 'category_data'].include?( klass )
187
- true
188
- else
189
- puts " remove class #{klass}"
190
- false
191
- end
192
- end
193
-
194
- if klasses.size > 0
195
- " class='#{klasses.join(' ')}'" ## note: add leading space!!
196
- else
197
- '' ## remove class attrib completely
198
- end
199
- end
200
-
201
- html
202
- end
203
-
204
-
205
- end # class Sanitizer
206
-
207
- end # module Factbook