factbook 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +0 -61
- data/README.md +8 -506
- data/Rakefile +4 -9
- data/lib/factbook.rb +4 -64
- metadata +6 -124
- data/data/attributes.yml +0 -337
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook/almanac.rb +0 -72
- data/lib/factbook/attributes.rb +0 -74
- data/lib/factbook/builder.rb +0 -212
- data/lib/factbook/builder_item.rb +0 -126
- data/lib/factbook/builder_json.rb +0 -79
- data/lib/factbook/codes.rb +0 -119
- data/lib/factbook/comparisons.rb +0 -50
- data/lib/factbook/counter.rb +0 -48
- data/lib/factbook/db/importer.rb +0 -92
- data/lib/factbook/db/models.rb +0 -11
- data/lib/factbook/db/schema.rb +0 -36
- data/lib/factbook/normalize.rb +0 -43
- data/lib/factbook/page.rb +0 -148
- data/lib/factbook/page_info.rb +0 -12
- data/lib/factbook/reader_json.rb +0 -51
- data/lib/factbook/sanitizer.rb +0 -178
- data/lib/factbook/sect.rb +0 -29
- data/lib/factbook/subsect.rb +0 -18
- data/lib/factbook/table.rb +0 -52
- data/lib/factbook/utils.rb +0 -85
- data/lib/factbook/utils_info.rb +0 -129
- data/lib/factbook/version.rb +0 -21
- data/script/almanac.rb +0 -48
- data/script/attributes.rb +0 -34
- data/script/build.rb +0 -28
- data/script/counter.rb +0 -145
- data/script/json.rb +0 -19
- data/script/testbr.rb +0 -33
- data/script/testcodes.rb +0 -11
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/helper.rb +0 -11
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_builder.rb +0 -35
- data/test/test_codes.rb +0 -76
- data/test/test_comparisons.rb +0 -19
- data/test/test_convert.rb +0 -30
- data/test/test_counter.rb +0 -31
- data/test/test_fields.rb +0 -52
- data/test/test_importer.rb +0 -56
- data/test/test_item_builder.rb +0 -99
- data/test/test_json.rb +0 -45
- data/test/test_json_builder.rb +0 -25
- data/test/test_normalize.rb +0 -23
- data/test/test_page.rb +0 -38
- data/test/test_sanitizer.rb +0 -39
- data/test/test_sanitizer_regex.rb +0 -89
data/lib/factbook/page.rb
DELETED
@@ -1,148 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
|
5
|
-
## note:
|
6
|
-
## some factbook pages with chrome (headers, footers, etc.)
|
7
|
-
## are NOT valid utf-8, thus,
|
8
|
-
## treat page as is (e.g. ASCII8BIT)
|
9
|
-
#
|
10
|
-
# only convert to utf8 when header and footer got stripped
|
11
|
-
|
12
|
-
##
|
13
|
-
## be/benin:
|
14
|
-
## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
|
15
|
-
#
|
16
|
-
## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
|
17
|
-
# Lazare Sèhouéto
|
18
|
-
#
|
19
|
-
# looks good - use (assume) Windows-1252 ????
|
20
|
-
|
21
|
-
##
|
22
|
-
# check whether the page is plain 7-bit ASCII - if yes, no worries
|
23
|
-
# if not, log number of chars not using ascii 7-bit
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
##
# One factbook country page.
#
# A page is built either from a JSON string (opts[:json]) or from an HTML
# string (opts[:html]); when neither is given the HTML is read via
# Webcache from SITE_BASE with the country code filled in.
#
#   sects - "structured" access (sects / subsects / etc.)
#   info  - meta info taken from the builder (or overridden via opts[:info])
#   data  - "plain" access: hash of section title => section data
class Page
  include LogUtils::Logging

  attr_reader :sects   ## "structured" access e.g. sects/subsects/etc.
  attr_reader :info    ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
  attr_reader :data    ## "plain" access with vanilla hash

  ## standard version (note: requires https)
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'

  ##
  # code - country code; only used to build the page url when no
  #        json / html text gets passed in
  # opts - :json  page as (not yet parsed) json text
  #        :html  page as html text (note: expects ASCII-8BIT/BINARY encoding)
  #        :info  page info that replaces the info found by the builder
  def initialize( code, opts={} )
    builder = if opts[:json]
                ## note: json is (still) a string/text (NOT yet parsed to structured data)
                JsonBuilder.from_string( opts[:json] )
              else    ## assume html
                html = if opts[:html]
                         ## for debugging and testing allow "custom" passed-in html page
                         opts[:html]
                       else
                         Webcache.read( SITE_BASE.gsub( '{code}', code ) )
                       end
                Builder.from_string( html )
              end

    @sects = builder.sects
    @info  = builder.info

    ## todo/fix/quick hack:
    ##   an info entry in the opts hash overwrites the page info from the builder
    @info = opts[:info]   if opts[:info]

    @data = {}
    @sects.each { |sect| @data[ sect.title ] = sect.data }
  end


  ## convenience helper for data.to_json; note: pretty print by default!
  ##   pass in minify: true for the compact version
  def to_json( opts={} )
    return data.to_json   if opts[:minify]

    JSON.pretty_generate( data )
  end


  ## convenience shortcut - lets you use
  ##     page['geo']
  ##   instead of
  ##     page.data['geo']
  def [](key)
    data[key]
  end


  ## add convenience (shortcut) accessors / attributes / fields / getters
  ##  e.g.
  ##    def background()  data['Introduction']['Background']['text']; end
  ##    def location()    data['Geography']['Location']['text']; end
  ##  note: missing categories / fields fall back to an empty hash,
  ##        thus, the getter returns nil (instead of raising an error)
  ATTRIBUTES.each do |attrib|
    ## a path with exactly one entry walks one level; anything else
    ##   is treated as two levels (assume size 2 for now)
    depth = attrib.path.size == 1 ? 1 : 2

    define_method attrib.name.to_sym do
      leaf = (0...depth).reduce( @data.fetch( attrib.category, {} ) ) do |node, idx|
        node.fetch( attrib.path[idx], {} )
      end
      leaf['text']
    end
  end


private

  ## fetch page text via Webget
  ##   note: exits the process on get / fetch error (e.g. HTTP status code != 200)
  ##         - do NOT continue for now
  def fetch_page( url )
    response = Webget.page( url )
    exit 1   if response.status.nok?

    response.text
  end


=begin
  def self.from_url( cc, cn )
    html_ascii = PageFetcher.new.fetch( cc )
    self.new( cc, cn, html_ascii )
  end

  def self.from_file( cc, cn, opts={} )
    input_dir = opts[:input_dir] || '.'
    html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
    self.new( cc, cn, html_ascii )
  end
=end

end # class Page
|
148
|
-
end # module Factbook
|
data/lib/factbook/page_info.rb
DELETED
data/lib/factbook/reader_json.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
##
# Reads factbook pages from json files stored as
#   <json_dir>/<region-slug>/<country-code>.json
class JsonPageReader
  ## json_dir - root directory holding the per-region subdirectories
  def initialize( json_dir )
    @json_dir = json_dir
  end

  ##
  # Read a single page.
  #
  # code - record responding to code, name and region
  #
  # Returns a Page built from the json text; raises whatever File.read
  #   raises if the json file is missing / unreadable.
  def read_page( code )
    path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"

    puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
    json = File.read( path )

    ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
    ##   add some page info from code struct
    info = PageInfo.new
    info.country_code = code.code
    info.country_name = code.name
    info.region_name  = code.region

    Page.new( code.code, json: json, info: info )
  end

  ##
  # Read pages for all codes.
  #
  # codes - collection of code records (anything responding to each)
  # limit - for debugging: only process the first x entries (nil = all)
  #
  # Returns an array of pages.
  def read_pages( codes, limit: nil )
    pages = []
    codes.each do |code|
      ## note: the old counter never got incremented, thus, the limit
      ##   never kicked in - stop once the requested number of pages is read
      break if limit && pages.size >= limit

      pages << read_page( code )
    end
    pages
  end

private

  ## turn a region name into a directory slug
  ##   change and  =>  n
  ##   change &    =>  n
  ##   change all spaces to => -
  ##   e.g. East & Southeast Asia          => east-n-southeast-asia
  ##        Central America and Caribbean  => central-america-n-caribbean
  ##
  ##  NOTE(review): 'and' gets replaced inside words too; kept as is to
  ##    stay in sync with however the json directories got generated
  def region_to_slug( text )
    text.downcase.gsub('and', 'n').gsub( '&', 'n' ).gsub( ' ', '-' )
  end
end ## JsonPageReader
|
50
|
-
|
51
|
-
end # module Factbook
|
data/lib/factbook/sanitizer.rb
DELETED
@@ -1,178 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
##
# Cuts a factbook country page down to the country profile
#   (no headers, footers, scripts, etc.) and collects the page (meta) info.
class Sanitizer
  include LogUtils::Logging
  include Utils    ## pulls in encode_utf8, ...

  ##
  # html_ascii - complete page as html text
  #
  # Returns an array with three entries:
  #   1) html profile without headers, footers, scripts, etc.
  #   2) page (meta) info e.g. country_name, country_code, last_updated, etc.
  #   3) errors e.g. list of encoding errors (invalid byte sequence etc.)
  #        -- note: always an empty list for now
  def sanitize( html_ascii )
    ## todo: add option for (html source) encoding - why?? why not??

    page_info = PageInfo.new

    ## todo:
    ##   make page info optional? why? why not?
    ##   not always available (if page structure changes) - check
    ##   what page info is required??
    ## note: find_page_info, find_country_code and find_page_last_updated are
    ##   not defined in this class - presumably pulled in via Utils
    h = find_page_info( html_ascii )
    if h
      page_info.country_code        = h[:country_code]
      page_info.country_name        = h[:country_name]
      page_info.country_affiliation = h[:country_affiliation]
      page_info.region_code         = h[:region_code]
      page_info.region_name         = h[:region_name]
    else
      page_info.country_code = find_country_code( html_ascii )
      ## print/warn: no page info found
    end

    page_info.last_updated = find_page_last_updated( html_ascii )

    html = find_country_profile( html_ascii )    ## cut-off headers, footers, scripts, etc.

    ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
    # html, errors = encode_utf8( html_profile_ascii )  ## change encoding to utf-8 (from binary/ascii8bit)

    # html = sanitize_profile( html )

    [html, page_info, []]
  end


  ## remove aria labels e.g.
  ##   <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
  ##
  ## NOTE(review): the m flag lets . match newlines too, that is, a label
  ##   may span lines - the original note here said "do NOT allow
  ##   multi-line - why? why not?"; pattern kept unchanged
  ARIA_ATTR_REGEX = /\s*
                      aria-label=('|").+?\1    ## note: use non-greedy match e.g. .+?
                    /xim


  ##
  # Extract the country profile from the complete page.
  #
  # html - complete page as html text
  #
  # Returns the profile as html text with one <h2> per section and
  #   one <h3> per subsection followed by the category_data divs;
  #   returns an empty string if the page still uses the old (h2-based) format.
  #
  # Raises ArgumentError if the page has no <ul class="expandcollapse">.
  # Exits the process if a section title is missing (see note below).
  def find_country_profile( html )
    ####
    ## remove header (everything before)
    ##   <ul class="expandcollapse">

    doc = Nokogiri::HTML( html )

    ul = doc.css( 'ul.expandcollapse' )[0]

    ## fail with a clear message instead of a NoMethodError on nil
    ##   if the page is not a country profile page (or the structure changed)
    if ul.nil?
      raise ArgumentError, 'no <ul class="expandcollapse"> found in page - cannot find country profile'
    end

    puts ul.to_html[0..100]

    ## note: special case cc uses h2 instead of div block
    ##   <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
    ##       style="border-bottom: 2px solid white; cursor: pointer;">
    ##     Introduction ::  <span class="region">CURACAO </span>
    ##   </h2>
    ##  is old format !!!!
    ##   cc - CURACAO
    ##    http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
    ##    page says - PAGE LAST UPDATED ON MARCH 14, 2018
    ##   wait for new version to be generated / pushed!!!

    ## check for old format if h2 are present
    h2s = ul.css( 'h2' )
    if h2s.size > 0
      puts "  !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
      ## return empty html string - why? why not?
      return ''
    end

    ###
    ## sanitize
    ##   remove link items
    ##   assume two <li>s are a section (title li followed by content li)

    html = String.new('')

    ## filter all li's (skip text nodes, etc.)
    ul_children = ul.children.select { |el| el.name == 'li' }
    puts "  #{ul_children.size} li(s):"

    ul_children.each_slice(2) do |title_li, content_li|
      title_div = title_li.at( 'div[sectiontitle]' )
      if title_div.nil?
        puts "!! ERROR: no section title found in div:"
        puts title_li.to_html
        ## NOTE(review): exits the whole process from library code;
        ##   kept as is to not change what callers see
        exit 1
      end

      section_title = title_div['sectiontitle'].to_s

      html << "<h2>#{section_title}</h2>\n"

      ## filter all div's
      li_children = content_li.children.select { |el| el.name == 'div' }
      puts "    #{li_children.size} div(s):"

      ## assume two <div>s are a subsection (anchor div followed by data div)
      li_children.each_slice(2) do |anchor_div, data_div|
        a = anchor_div.css('a')[0]

        if a
          html << "\n<h3>#{a.text}:</h3>\n"
        else
          puts "!! WARN: no anchor found:"
          puts anchor_div.to_html
        end

        div_children = data_div.children.select { |el| el.name == 'div' }
        div_children.each do |catdiv|
          if catdiv['class'] && catdiv['class'].index( 'category_data' )
            if catdiv['class'].index( 'attachment' )
              ## skip attachments e.g. maps, pop pyramids, etc.
            else
              html << catdiv.to_html
              html << "\n"
            end
          else
            puts "!! WARN: skipping div (W/O category_data class):"
            puts catdiv.to_html
          end
        end
      end
    end

    html = html.gsub( ARIA_ATTR_REGEX ) do |m|
      puts "remove aria-label attr:"
      puts "#{m}"
      ''
    end

    html
  end

end # class Sanitizer
|
177
|
-
|
178
|
-
end # module Factbook
|
data/lib/factbook/sect.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
##
# A page section - a title plus a list of subsections.
class Sect
  include LogUtils::Logging

  ## title    - section title (use name instead of title - why? why not?)
  ## subsects - list of subsections
  attr_accessor :title, :subsects

  def initialize
    @subsects = []
  end

  ## convert subsects to hash (subsect title => subsect data)
  ##   note: gets rebuilt on every call
  def data
    @data = {}
    subsects.each { |subsect| @data[ subsect.title ] = subsect.data }
    @data
  end

end # class Sect
|
28
|
-
|
29
|
-
end # module Factbook
|
data/lib/factbook/subsect.rb
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
##
# A page subsection - a title plus a data hash.
class Subsect
  include LogUtils::Logging

  ## title - subsection title (use name instead of title - why? why not?)
  ## data  - hash holding data e.g. { 'text' => '...' etc. }
  attr_accessor :title, :data

  ## starts out with an empty data hash (and no title)
  def initialize
    @data = {}
  end

end # class Subsect
|
17
|
-
|
18
|
-
end # module Factbook
|
data/lib/factbook/table.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
##
|
6
|
-
## make more "generic" - why? why not?
|
7
|
-
## (re)use for other files ?? move to textutils ??
|
8
|
-
|
9
|
-
##
|
10
|
-
## for now reads in rows with values separated by at least 3+ spaces e.g.:
|
11
|
-
## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
|
12
|
-
## 1 China 1,367,485,388
|
13
|
-
## 2 India 1,251,695,584
|
14
|
-
## 3 European Union 513,949,445
|
15
|
-
## 4 United States 321,368,864
|
16
|
-
## 5 Indonesia 255,993,674
|
17
|
-
## 6 Brazil 204,259,812
|
18
|
-
|
19
|
-
|
20
|
-
##
# Reads a plain text table with values separated by three or more spaces.
class TableReader
  include LogUtils::Logging

  ## text - complete table as text (one record per line)
  def initialize( text )
    @text = text
  end

  ##
  # Returns an array of records; every record is an array of
  #   the (string) values found in the line.
  #   Empty lines get skipped (and reported with the line number).
  def read
    rows = []

    @text.each_line.with_index( 1 ) do |raw, line_no|
      row = raw.strip     ## remove leading and trailing whitespace

      if row.empty?
        puts "** skipping empty line #{line_no}"
      else
        ## split three or more spaces - use just two ?? why? why not??
        rows << row.split( /[ ]{3,}/ )
      end
    end

    rows
  end

end # class TableReader
|
51
|
-
|
52
|
-
end # module Factbook
|