factbook-readers 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Manifest.txt +56 -0
- data/README.md +196 -0
- data/Rakefile +34 -0
- data/data/attributes.yml +337 -0
- data/data/categories.csv +164 -0
- data/data/codes.csv +262 -0
- data/data/codesxref.csv +280 -0
- data/data/comparisons.csv +75 -0
- data/lib/factbook-readers.rb +59 -0
- data/lib/factbook-readers/attributes.rb +74 -0
- data/lib/factbook-readers/builder.rb +212 -0
- data/lib/factbook-readers/builder_item.rb +185 -0
- data/lib/factbook-readers/builder_json.rb +79 -0
- data/lib/factbook-readers/codes.rb +122 -0
- data/lib/factbook-readers/comparisons.rb +50 -0
- data/lib/factbook-readers/counter.rb +48 -0
- data/lib/factbook-readers/normalize.rb +43 -0
- data/lib/factbook-readers/page.rb +148 -0
- data/lib/factbook-readers/page_info.rb +12 -0
- data/lib/factbook-readers/reader_json.rb +51 -0
- data/lib/factbook-readers/sanitizer.rb +307 -0
- data/lib/factbook-readers/sect.rb +29 -0
- data/lib/factbook-readers/subsect.rb +18 -0
- data/lib/factbook-readers/table.rb +52 -0
- data/lib/factbook-readers/utils.rb +47 -0
- data/lib/factbook-readers/utils_info.rb +129 -0
- data/lib/factbook-readers/version.rb +24 -0
- data/lib/factbook/readers.rb +5 -0
- data/test/data/au.html +579 -0
- data/test/data/au.yml +8 -0
- data/test/data/be.html +596 -0
- data/test/data/be.yml +8 -0
- data/test/data/json/au.json +892 -0
- data/test/data/src/ag.html +716 -0
- data/test/data/src/au-2015-09-24.html +2006 -0
- data/test/data/src/au.html +658 -0
- data/test/data/src/be-2015-09-24.html +2011 -0
- data/test/data/src/be.html +648 -0
- data/test/helper.rb +11 -0
- data/test/test_attribs.rb +87 -0
- data/test/test_attribs_def.rb +20 -0
- data/test/test_builder.rb +35 -0
- data/test/test_codes.rb +76 -0
- data/test/test_comparisons.rb +19 -0
- data/test/test_convert.rb +30 -0
- data/test/test_counter.rb +31 -0
- data/test/test_fields.rb +52 -0
- data/test/test_importer.rb +56 -0
- data/test/test_item_builder.rb +99 -0
- data/test/test_json.rb +45 -0
- data/test/test_json_builder.rb +25 -0
- data/test/test_normalize.rb +23 -0
- data/test/test_page.rb +38 -0
- data/test/test_sanitizer.rb +39 -0
- data/test/test_sanitizer_regex.rb +89 -0
- metadata +196 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
######
|
6
|
+
# json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
|
7
|
+
|
8
|
+
##
# Rebuilds the sections / subsections of a page from "dumped" json text
# (instead of parsing an html page).
#
# The json is expected to be an object of objects:
#   { "<section title>" => { "<subsection title>" => <data>, ... }, ... }
class JsonBuilder
  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category

  ## Read the json dump stored at +path+ and build from its text.
  ##   note: the file is read explicitly as utf-8 (and a leading byte order
  ##   mark gets dropped) instead of relying on the default external encoding
  def self.from_file( path )
    text = File.open( path, 'r:bom|utf-8', &:read )
    from_string( text )
  end

  ## Build from json +text+ (a string - NOT yet parsed to structured data).
  def self.from_string( text )
    new( text )
  end


  attr_reader :text,     ## json text as passed in
              :json,     ## parsed json
              :info,     ## not used yet -- page info incl. country_name, region_name, last_updated etc.
              :errors,   ## not used yet -- encoding errors etc.
              :sects     ## array of Sect objects (one per top-level json entry)


  ## Parse +text+ and collect one Sect per top-level entry.
  ##   note: JSON.parse raises on invalid json
  def initialize( text )
    @text   = text
    @json   = JSON.parse( text )

    @info   = nil    ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
    @errors = []     ## fix/todo: sorry - for now no errors possible/tracked

    @sects  = @json.map do |sect_title, sect_subsects|
      build_sect( sect_title, sect_subsects )
    end
  end


  private

  ## Build a Sect titled +title+ holding one Subsect per entry in +entries+.
  def build_sect( title, entries )
    sect = Sect.new
    sect.title    = title
    sect.subsects = entries.map do |subsect_title, subsect_data|
      build_subsect( subsect_title, subsect_data )
    end
    sect
  end

  ## Build a Subsect titled +title+; hash data gets its keys normalized.
  def build_subsect( title, data )
    subsect = Subsect.new
    subsect.title = title
    subsect.data  = data.is_a?( Hash ) ? normalize_keys( data ) : data
    subsect
  end

  ## note: run data hash through normalize_category (again)
  def normalize_keys( data )
    data.each_with_object( {} ) do |(key, value), normalized|
      normalized[ normalize_category( key ) ] = value
    end
  end

end # class JsonBuilder
|
77
|
+
|
78
|
+
|
79
|
+
end # module Factbook
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
##
|
4
|
+
# note:
|
5
|
+
# the factbook category/region for world is other entities (on FAQ) and oceans in page
|
6
|
+
# changed to world
|
7
|
+
|
8
|
+
|
9
|
+
module Factbook
|
10
|
+
|
11
|
+
##
# Collection of factbook code records (code, name, category, region).
#
# All filters (select, category, region and the pre-defined shortcuts)
# return a new Codes object for easy-chaining e.g.
#   codes.countries.europe.to_a
class Codes

  Code = Struct.new( :code,       ## todo: add notes (country affiliation) - why? why not??
                     :name,
                     :category,   ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
                     :region )    ## e.g. Europe, Oceans, etc.

  ## Read code records from the csv file at +path+.
  ##   expects the headers Code,Name and (optional) Category,Region
  def self.from_csv( path )
    ###
    #  note:
    #   if you use quotes - NO leading spaces allowed e.g.
    #     use au,"Austria",... and NOT
    #         au, "Austria", ...
    #
    #   for headers - NO leading spaces allowed e.g.
    #     use Code,Name,Category,Region,... and NOT
    #         Code, Name, Category, Region, ...

    rows = CsvHash.read( path )

    recs = rows.map do |row|
      Code.new( row['Code'].strip,    ## remove leading n trailing whitespaces
                row['Name'].strip,
                optional( row['Category'] ),   ## note: for now category and region are optional
                optional( row['Region'] ) )
    end

    new( recs )
  end

  ## Strip an optional csv value; missing, empty or whitespace-only => nil.
  def self.optional( value )
    return nil if value.nil?

    value = value.strip
    value.empty? ? nil : value
  end
  private_class_method :optional


  ## +codes+ is an array of Code records.
  def initialize( codes )
    @codes = codes
  end

  def size() @codes.size; end

  def each( &blk ) @codes.each( &blk ); end

  def select( &blk )
    codes = @codes.select( &blk )
    Codes.new( codes )   ## return (again) new Codes obj for easy-chaining - why? why not?
  end


  def to_a
    @codes.collect {|code| code.code }   ## return array of codes
  end

  ## def all() self.to_a; end    ## note: alias for to_a - use - why? why not??

  ## "pre-defined" convenience shortcuts
  def countries()        category 'Countries'; end
  def world()            category 'World'; end
  def oceans()           category 'Oceans'; end
  def misc()             category 'Miscellaneous'; end
  def others()           category 'Other'; end
  def dependencies()     category 'Dependencies'; end
  def dependencies_us()  category 'Dependencies (United States)'; end
  ## fix/todo: add all dependencies uk (or gb?), fr,cn,au,nz,no,dk,etc.

  def europe()                       region 'Europe'; end
  def south_asia()                   region 'South Asia'; end
  def central_asia()                 region 'Central Asia'; end
  def east_n_souteast_asia()         region 'East & Southeast Asia'; end
  def middle_east()                  region 'Middle East'; end
  def africa()                       region 'Africa'; end
  def north_america()                region 'North America'; end
  def central_america_n_caribbean()  region 'Central America and Caribbean'; end
  def south_america()                region 'South America'; end
  def australia_oceania()            region 'Australia-Oceania'; end
  def antartica()                    region 'Antarctica'; end

  ## note: the original (misspelled) shortcut names are kept for
  ##   backwards compatibility; prefer the correctly spelled aliases
  alias_method :east_n_southeast_asia, :east_n_souteast_asia
  alias_method :antarctica,            :antartica

  ## note: regions oceans and world - same as category oceans and world
  ##     use oceans_ii or world_ii or something ??
  ##     use category('World') n region('World')
  ##     use category('Oceans') n region('Oceans')


  ## Records whose category matches +query+ (case-insensitive substring
  ##   if +query+ is a string; used as is if it is a Regexp).
  def category( query ) filter_by( :category, query ); end

  ## Records whose region matches +query+ (same rules as category).
  def region( query )   filter_by( :region, query ); end


  private

  ## Shared filter for category / region.
  def filter_by( field, query )
    ## note: strings get escaped e.g. Dependencies (France) turns into
    ##         Dependencies \(France\) etc.
    filter_regex = query.is_a?( Regexp ) ? query : /#{Regexp.escape(query)}/i

    codes = @codes.select do |code|
      value = code[ field ]
      value ? filter_regex.match?( value ) : false   ## note: allow nil for category / region; will fail on search
    end
    Codes.new( codes )   ## return new Codes obj for easy-chaining
  end

end # class Codes
|
120
|
+
|
121
|
+
end # module Factbook
|
122
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
##
# Collection of factbook comparison records (num, category, name).
class Comparisons

  Comparison = Struct.new( :num,       ### todo: use no or id or something - why? why not?
                           :category,  ## e.g. Geography, People, Economy, etc.
                           :name )

  ## Read comparison records from the csv file at +path+.
  ##   expects the headers Num,Category,Name
  def self.from_csv( path )
    rows = CsvHash.read( path )

    recs = rows.map do |row|
      Comparison.new( row['Num'].strip.to_i,    ## remove leading n trailing whitespaces
                      row['Category'].strip,
                      row['Name'].strip )
    end

    new( recs )
  end

  ## +comps+ is an array of Comparison records.
  def initialize( comps )
    @comps = comps
  end

  def size() @comps.size; end

  ## Yield every comparison record
  ##   note: the block gets forwarded; without a block
  ##   this hands back whatever Array#each returns (an enumerator)
  def each( &blk ) @comps.each( &blk ); end

  def to_a
    @comps.collect {|comp| comp.num }   ## return array of nums -- return something else - why? why not?
  end

end # class Comparisons
|
48
|
+
|
49
|
+
end # module Factbook
|
50
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
##
# Counts how often every (nested) hash node shows up across pages.
#
# For every node the data hash holds an entry with
#   :count  - number of pages the node was seen in
#   :codes  - space-separated country codes of those pages
#             (dropped once the count passes the threshold)
# plus one entry per nested hash node (keyed by the node's key).
class Counter

  ## keep listing country codes for a node up to this many pages
  DEFAULT_THRESHOLD = 9

  attr_reader :data

  ## +threshold+ - max. number of pages to keep the codes list for
  def initialize( threshold: DEFAULT_THRESHOLD )
    @data      = {}
    @threshold = threshold
  end

  ## Add the nodes of +page+ to data.
  ##   +page+ needs to respond to data (nested hash) and info.country_code
  def count( page )
    ## walk page data hash
    #   add nodes to data
    walk( page, page.data, @data )
  end


  private

  def walk( page, hin, hout )
    hin.each do |key, value|
      next unless value.is_a?( Hash )    ## note: only hash nodes get counted

      ## note: use a fresh (mutable) string per node - do NOT append to a string literal
      node = hout[key] || { count: 0, codes: ''.dup }

      node[ :count ] += 1

      ## delete codes if larger (threshold) than x (e.g. 9)
      node.delete( :codes ) if node[ :count ] > @threshold

      codes = node[ :codes ]
      if codes   ## note: might got deleted if passed threshold (e.g. 9 entries)
        codes << ' '  unless codes.empty?   ## add separator (space for now)
        codes << page.info.country_code
      end

      hout[key] = node
      walk( page, value, node )
    end
  end

end # class Counter
|
47
|
+
|
48
|
+
end # module Factbook
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
module NormalizeHelper

  ## Clean up a category title:
  ##   strips surrounding whitespace and any trailing colon(s),
  ##   then maps known typos / case / spelling variants to a single form.
  def normalize_category( text )
    ## note: also fixes typos/errors with double colons e.g. note:: (instead of note:), thus, use :+
    title = text.strip.sub( /:+\z/, '' ).strip

    ## border countries (8): -- remove (x) counter
    return 'border countries'  if title.start_with?( 'border countries')

    #######################################
    ### special cases
    case title
    ## typos e.g ntoe => use note
    when 'ntoe'                           then 'note'
    when 'investment if fixed capital'    then 'investment in fixed capital'
    ## downcase
    when 'Lowest point'                   then 'lowest point'
    when 'Chief of state'                 then 'chief of state'
    ## spelling variant (use more popular one)
    when 'signed but not ratified'        then 'signed, but not ratified'
    when 'vectorborne disease'            then 'vectorborne diseases'
    when 'water contact diseases'         then 'water contact disease'
    when 'food or waterborne disease'     then 'food or waterborne diseases'
    when 'geographical coordinates'       then 'geographic coordinates'
    when 'notes'                          then 'note'
    when 'refugees (countries of origin)' then 'refugees (country of origin)'
    else title
    end
  end

end # module NormalizeHelper
|
43
|
+
end # module Factbook
|
@@ -0,0 +1,148 @@
|
|
1
|
+
|
2
|
+
module Factbook
|
3
|
+
|
4
|
+
|
5
|
+
## note:
|
6
|
+
## some factbook pages with chrome (headers, footers, etc.)
|
7
|
+
## are NOT valid utf-8, thus,
|
8
|
+
## treat page as is (e.g. ASCII8BIT)
|
9
|
+
#
|
10
|
+
# only convert to utf8 when header and footer got stripped
|
11
|
+
|
12
|
+
##
|
13
|
+
## be/benin:
|
14
|
+
## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
|
15
|
+
#
|
16
|
+
## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
|
17
|
+
# Lazare Sèhouéto
|
18
|
+
#
|
19
|
+
# looks good - use (assume) Windows-1252 ????
|
20
|
+
|
21
|
+
##
|
22
|
+
# todo: check if the page is 7-bit ascii only; if yes - no worries
|
23
|
+
# if not, log number of chars not using ascii 7-bit
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
##
# A factbook page built either from json text (opts[:json]),
# from passed-in html (opts[:html]) or from the html read for the
# country code via Webcache.
class Page
  include LogUtils::Logging

  attr_reader :sects    ## "structured" access e.g. sects/subsects/etc.
  attr_reader :info     ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
  attr_reader :data     ## "plain" access with vanilla hash


  ## standard version  (note: requires https)
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'

  ## +code+ - country code; used to build the page url (if no json or html gets passed in)
  ## +opts+ - :json  json text (string - NOT yet parsed to structured data)
  ##          :html  html text (for debugging and testing)
  ##          :info  page info - overwrites the page info from the builder
  def initialize( code, opts={} )
    ### keep code - why? why not??  (use page_info/info e.g. info.country_code??)

    b = if opts[:json]
          JsonBuilder.from_string( opts[:json] )
        else  ## assume html
          ## note: expects ASCII-7BIT/BINARY encoding
          ##   html = fetch_page( url_string )   ## use PageFetcher class - why?? why not??
          html = opts[:html] || Webcache.read( SITE_BASE.gsub( '{code}', code ) )
          Builder.from_string( html )
        end

    @sects = b.sects

    ## todo/fix/quick hack:
    ##  check for info opts hash entry - lets you overwrite page info
    ##   -- use proper header to setup page info - why, why not??
    @info  = opts[:info] || b.info

    @data  = @sects.each_with_object( {} ) do |sect, data|
      data[ sect.title ] = sect.data
    end
  end


  def to_json( opts={} )  ## convenience helper for data.to_json; note: pretty print by default!
    if opts[:minify]
      data.to_json
    else
      ## was: -- opts[:pretty] || opts[:pp]
      JSON.pretty_generate( data )   ## note: pretty print by default!
    end
  end


  def [](key)  ### convenience shortcut
    # lets you use
    #   page['geo']
    #   instead of
    #   page.data['geo']

    ##  fix: use delegate data, [] from forwardable lib - why?? why not??

    data[key]
  end

  ## add convenience (shortcut) accessors / attributes / fields / getters

  ATTRIBUTES.each do |attrib|
    ## e.g.
    ##    def background()  data['Introduction']['Background']['text']; end
    ##    def location()    data['Geography']['Location']['text'];      end
    ##  etc.
    ##
    ## note: walks the full attribute path (any number of segments);
    ##   a missing key along the way results in nil
    define_method attrib.name.to_sym do
      node = attrib.path.reduce( @data.fetch( attrib.category, {} ) ) do |hash, key|
        hash.fetch( key, {} )
      end
      node['text']
    end
  end


  private

  ## Get the page text for +url+ via Webget.
  def fetch_page( url )
    response = Webget.page( url )

    ## note: exit on get / fetch error - do NOT continue for now - why? why not?
    exit 1   if response.status.nok?    ##  e.g.  HTTP status code != 200

    response.text
  end


=begin
def self.from_url( cc, cn )
  html_ascii = PageFetcher.new.fetch( cc )
  self.new( cc, cn, html_ascii )
end

def self.from_file( cc, cn, opts={} )
  input_dir = opts[:input_dir] || '.'
  html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
  self.new( cc, cn, html_ascii )
end
=end


end # class Page
|
148
|
+
end # module Factbook
|