factbook 1.1.1 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +4 -4
  2. data/{HISTORY.md → CHANGELOG.md} +3 -3
  3. data/Manifest.txt +1 -58
  4. data/README.md +50 -575
  5. data/Rakefile +29 -33
  6. data/lib/factbook.rb +8 -75
  7. metadata +20 -114
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -214
  16. data/lib/factbook/builder_item.rb +0 -92
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -185
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -207
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -102
  34. data/lib/factbook/version.rb +0 -22
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -18
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/au.html +0 -2006
  48. data/test/data/src/be.html +0 -2011
  49. data/test/helper.rb +0 -11
  50. data/test/test_attribs.rb +0 -82
  51. data/test/test_attribs_def.rb +0 -20
  52. data/test/test_builder.rb +0 -35
  53. data/test/test_codes.rb +0 -76
  54. data/test/test_comparisons.rb +0 -19
  55. data/test/test_convert.rb +0 -30
  56. data/test/test_counter.rb +0 -31
  57. data/test/test_fields.rb +0 -52
  58. data/test/test_importer.rb +0 -55
  59. data/test/test_item_builder.rb +0 -99
  60. data/test/test_json.rb +0 -44
  61. data/test/test_json_builder.rb +0 -25
  62. data/test/test_normalize.rb +0 -23
  63. data/test/test_page.rb +0 -38
  64. data/test/test_sanitizer.rb +0 -35
@@ -1,185 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- ## note:
7
- ## some factbook pages with chrome (headers, footers, etc.)
8
- ## are NOT valid utf-8, thus,
9
- ## treat page as is (e.g. ASCII8BIT)
10
- #
11
- # only convert to utf8 when header and footer got stripped
12
-
13
- ##
14
- ## be/benin:
15
- ## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
16
- #
17
- ## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
18
- # Lazare Sèhouéto
19
- #
20
- # looks good - use (assume) Windows-1252 ????
21
-
22
- ##
23
- # check if page is ascii 7-bit ??? if yes - no worries
24
- # if not, log number of chars not using ascii 7-bit
25
-
26
-
27
-
28
## A single World Factbook country page.
##
## The page content comes from one of three sources (checked in this order):
##   opts[:json] - a json string (NOT yet parsed), handed to JsonBuilder
##   opts[:html] - an html string, handed to Builder (for debugging and testing)
##   otherwise   - html fetched from SITE_BASE for the passed-in country code
class Page
  include LogUtils::Logging

  attr_reader :sects    ## "structured" access e.g. sects/subsects/etc.
  attr_reader :info     ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
  attr_reader :data     ## "plain" access with vanilla hash (section title => section data)

  ## standard version (note: requires https)
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'

  ## code - country code; replaces {code} in SITE_BASE when the page gets fetched
  ## opts - optional settings
  ##   :json - json page as a string/text (NOT yet parsed to structured data)
  ##   :html - html page as a string; note: expects ASCII-7BIT/BINARY encoding
  ##   :info - page info; overwrites the page info found by the builder
  ##
  ## raises RuntimeError if the page needs to get fetched and the response code is not 200
  def initialize( code, opts={} )
    ### keep code - why? why not?? (use page_info/info e.g. info.country_code??)

    builder = if opts[:json]
                JsonBuilder.from_string( opts[:json] )
              else   ## assume html
                ## for debugging and testing allow "custom" passed-in html page
                ## note: expects ASCII-7BIT/BINARY encoding (passed-in or fetched)
                html = opts[:html] || fetch_page( SITE_BASE.gsub( '{code}', code ) )
                Builder.from_string( html )
              end

    @sects = builder.sects
    @info  = builder.info

    ## todo/fix/quick hack:
    ##   check for info opts hash entry - lets you overwrite page info
    ##   -- use proper header to setup page info - why, why not??
    @info = opts[:info] if opts[:info]

    @data = {}
    @sects.each { |sect| @data[ sect.title ] = sect.data }
  end


  ## convenience helper for data.to_json; note: pretty print by default!
  ##   pass in minify: true for the compact (one-line) version
  def to_json( opts={} )
    return data.to_json if opts[:minify]

    ## was: -- opts[:pretty] || opts[:pp]
    JSON.pretty_generate( data )
  end


  ## convenience shortcut - lets you use
  ##   page['geo']
  ## instead of
  ##   page.data['geo']
  def [](key)
    ## fix: use delegate data, [] from forwardable lib - why?? why not??
    data[key]
  end


  ## add convenience (shortcut) accessors / attributes / fields / getters
  ##   e.g.
  ##     def background()  data['Introduction']['Background']['text']; end
  ##     def location()    data['Geography']['Location']['text'];      end
  ##   etc.
  ATTRIBUTES.each do |attrib|
    define_method attrib.name.to_sym do
      ## walk down category => path[0] => path[1] => ...;
      ##   a missing key yields an empty hash, thus, the final ['text'] lookup returns nil
      node = attrib.path.reduce( @data.fetch( attrib.category, {} ) ) do |hash, key|
        hash.fetch( key, {} )
      end
      node['text']
    end
  end


  private

  ## fetch the page via Fetcher::Worker and return the response body as is
  ##   raises RuntimeError if the response code is not 200
  def fetch_page( url_string )
    response = Fetcher::Worker.new.get_response( url_string )

    unless response.code == '200'
      logger.error "fetch HTTP - #{response.code} #{response.message}"
      ## todo/fix: raise http exception (see fetcher)  -- why? why not??
      fail "fetch HTTP - #{response.code} #{response.message}"
    end

    body = response.body
    ###
    # NB: Net::HTTP will NOT set encoding UTF-8 etc.
    #   will mostly be ASCII
    #   - try to change encoding to UTF-8 ourselves
    logger.debug "t.encoding.name (before): #{body.encoding.name}"
    #####
    # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
    body
  end


=begin
  def self.from_url( cc, cn )
    html_ascii = PageFetcher.new.fetch( cc )
    self.new( cc, cn, html_ascii )
  end

  def self.from_file( cc, cn, opts={} )
    input_dir = opts[:input_dir] || '.'
    html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
    self.new( cc, cn, html_ascii )
  end
=end

end # class Page
160
-
161
-
162
- =begin
163
- class PageFetcher
164
-
165
- def fetch( cc )
166
- worker = Fetcher::Worker.new
167
- factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
168
-
169
- res = worker.get_response( "#{factbook_base}/#{cc}.html" )
170
-
171
- # on error throw exception - why? why not??
172
- if res.code != '200'
173
- raise Fetcher::HttpError.new( res.code, res.message )
174
- end
175
-
176
- ###
177
- # Note: Net::HTTP will NOT set encoding UTF-8 etc.
178
- # will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
179
- html = res.body.to_s
180
- end
181
- end # PageFetcher
182
- =end
183
-
184
-
185
- end # module Factbook
@@ -1,12 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
## page (meta) info record; all fields are nil until set
PageInfo = Struct.new( *%i[country_code
                           country_name
                           country_affiliation
                           region_code
                           region_name
                           last_updated] )
11
-
12
- end # module Factbook
@@ -1,51 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
## Reads factbook pages from json files stored as
##   <json_dir>/<region slug>/<country code>.json
class JsonPageReader
  ## json_dir - base directory holding one sub directory per region
  def initialize( json_dir )
    @json_dir = json_dir
  end

  ## Read a single page.
  ##   code - record responding to code, name and region
  ## returns a Page built from the json text, with page info filled in from code
  def read_page( code )
    path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"

    puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
    json = File.read( path )

    ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
    #   add some page info from code struct
    info = PageInfo.new
    info.country_code = code.code
    info.country_name = code.name
    info.region_name  = code.region

    Page.new( code.code, json: json, info: info )
  end

  ## Read all pages for the passed-in codes (in order).
  ##   limit - for debugging just process the first x entries; nil (default) reads all
  ## returns an array of pages
  def read_pages( codes, limit: nil )
    pages = []
    codes.each do |code|
      ## note: stop as soon as the limit is reached (no need to look at the rest)
      break if limit && pages.size >= limit

      pages << read_page( code )
    end
    pages
  end

  private

  ## Turn a region name into a directory slug:
  ##   change and (standalone word only) => n
  ##   change &                          => n
  ##   change all spaces to              => -
  ##  e.g. East & Southeast Asia          => east-n-southeast-asia
  ##       Central America and Caribbean  => central-america-n-caribbean
  def region_to_slug( text )
    ## note: \b keeps words that merely contain "and" (e.g. islands) intact
    text.downcase.gsub( /\band\b|&/, 'n' ).gsub( ' ', '-' )
  end
end  ## JsonPageReader
50
-
51
- end # module Factbook
@@ -1,207 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class Sanitizer
6
- include LogUtils::Logging
7
- include Utils ## pulls in encode_utf8, ...
8
-
9
-
10
## Clean up a raw factbook page.
##
## note:
##   returns 1) html profile without headers, footers, scripts, etc.
##           2) page (meta) info e.g. country_name, country_code, last_updated, etc.
##           3) errors e.g. list of encoding errors (invalid byte sequence etc.)
def sanitize( html_ascii )
  ## todo: add option for (html source) encoding - why?? why not??

  meta = find_page_info( html_ascii )

  page_info = PageInfo.new
  [:country_code,
   :country_name,
   :country_affiliation,
   :region_code,
   :region_name].each do |field|
    page_info[ field ] = meta[ field ]
  end
  page_info.last_updated = find_page_last_updated( html_ascii )

  ## cut-off headers, footers, scripts, etc.
  profile_ascii = find_country_profile( html_ascii )

  ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
  ## change encoding to utf-8 (from binary/ascii8bit)
  html_utf8, errors = encode_utf8( profile_ascii )

  [sanitize_profile( html_utf8 ), page_info, errors]
end
39
-
40
-
41
-
42
## marks the start of the facts: <ul class="expandcollapse">
BEGIN_FACTS_REGEX = /<ul\s+
                       class="expandcollapse">
                    /xim     ## ignore case; multi-line

## marks the end of the facts: </li> </ul> </tbody> </table>
END_FACTS_REGEX = /<\/li>\s*
                    <\/ul>\s*
                    <\/tbody>\s*
                    <\/table>
                  /xim     ## ignore case; multi-line


## Cut the country profile out of a complete page.
##   drops everything before the begin facts marker and
##   everything from the end facts marker on (and closes the open list again)
## raises RuntimeError if one of the two markers is missing
def find_country_profile( html )
  ####
  ## remove header (everything before)
  ##   <ul class="expandcollapse">
  facts_start = html =~ BEGIN_FACTS_REGEX
  fail "*** no begin facts marker found for page" unless facts_start

  puts " bingo - found BEGIN_FACTS on pos #{facts_start}"
  profile = html[facts_start..-1]

  pp profile[0..100]

  ###
  ## remove footer
  ##  assume everything after (first match of a list item closing an unordered list inside a table body)
  ##   </li>
  ##   </ul>
  ##   </tbody></table>
  facts_end = profile =~ END_FACTS_REGEX
  fail "*** no end facts marker found for page" unless facts_end

  puts " bingo - found END_FACTS on pos #{facts_end}"
  ## note: slice stops right before facts_end (marker itself gets cut-off)
  profile = profile.slice( 0, facts_end ) + "</li></ul>\n"

  pp profile[-200..-1]
  profile
end
82
-
83
-
84
-
85
## style attribute incl. leading spaces e.g. style="..." or style='...'
##   note: uses .*? (not .+?) so an empty value e.g. style="" stops at its own closing quote
STYLE_ATTR_REGEX = /\s*
                     style=('|").*?\1      ## note: use non-greedy match
                   /xim      ## note: m lets . match newlines too

## class attribute incl. leading spaces; the class names end up in capture group 2
##   note: uses .*? (not .+?) so an empty value e.g. class="" gets matched too
CLASS_ATTR_REGEX = /\s*
                     class=('|")(.*?)\1    ## note: use non-greedy match
                   /xim      ## note: m lets . match newlines too

##
## <div>
##   <span class='category'>country comparison to the world: </span>
##   <span class='category_data'>[[191]]</span>
## </div>
##
## <span class='category'>country comparison to the world: </span>
## <span class='category_data'><a href='../rankorder/2147rank.html#au'>114</a></span>

## note: spaces and line breaks between the tags are allowed
COUNTRY_COMPARISON_REGEX = /
   <div> \s*
     <span \s class='category'[^>]*>
       country \s comparison \s to \s the \s world: \s*
     <\/span>
     \s*
     <span \s class='category_data'[^>]*>
       \s*
       <a \s [^>]+>
        .+?
       <\/a>
       \s*
     <\/span> \s*
   <\/div>
  /xim

##
## <div class='wrap'>
##   <div class='audio-player'>
##     <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
##     </audio>
##   </div></div>

## note: spaces and line breaks between the tags are allowed
AUDIO_PLAYER_REGEX = /
   <div \s class='wrap'> \s*
     <div \s class='audio-player'> \s*
       <audio \s [^>]+> \s*
       <\/audio> \s*
     <\/div> \s*
   <\/div>
  /xim

## Clean up the (utf-8) country profile html. In order:
##   1) remove all style attributes
##   2) remove audio players
##   3) remove "country comparison to the world" blocks
##   4) replace anchors (a href) with the inner text; anchors holding an image get dropped
##   5) remove all list tags e.g. ul/li
##   6) remove all classes except region, category and category_data
## returns the cleaned up html (the passed-in string is NOT changed);
##   every removal gets printed (puts) for debugging
def sanitize_profile( html )

  html = html.gsub( STYLE_ATTR_REGEX ) do |m|
    puts "remove style attr:"
    puts "#{m}"
    ''
  end

  html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
    puts "remove audio player:"
    puts "#{m}"
    ''
  end

  html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
    puts "remove country comparison:"
    puts "#{m}"
    ''
  end

  ## remove/cleanup anchors (a href)
  ##   note: use .*? non-greedy match; * (not +) so an empty anchor
  ##         does NOT run on into the next closing a tag
  html = html.gsub( /<a\s+href[^>]*>(.*?)<\/a>/im ) do |_|
    inner_text = Regexp.last_match( 1 )   ## keep a copy (the image check below resets the last match)
    puts " replace anchor (a) href >#{inner_text}<"

    if inner_text =~ /<img/      ## if includes image remove
      puts " remove image in anchor"
      ''
    else                         ## keep inner text
      inner_text
    end
  end

  ## remove all list e.g. ul/li
  ##   note: \b keeps other tags starting with li or ul (e.g. link) intact
  html = html.gsub( /<\/?(?:li|ul)\b[^>]*>/im ) do |m|
    puts " remove list >#{m}<"
    ''
  end

  ## clean-up class attrib e.g. remove unknown classes
  known_klasses = ['region', 'category', 'category_data']
  html = html.gsub( CLASS_ATTR_REGEX ) do |m|
    puts "cleanup class attr:"
    puts "#{m}"

    klasses = Regexp.last_match( 2 ).split(' ').select do |klass|
      keep = known_klasses.include?( klass )
      puts " remove class #{klass}" unless keep
      keep
    end

    if klasses.empty?
      ''     ## remove class attrib completely
    else
      " class='#{klasses.join(' ')}'"   ## note: add leading space!!
    end
  end

  html
end
203
-
204
-
205
- end # class Sanitizer
206
-
207
- end # module Factbook