factbook-readers 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Manifest.txt +56 -0
- data/README.md +196 -0
- data/Rakefile +34 -0
- data/data/attributes.yml +337 -0
- data/data/categories.csv +164 -0
- data/data/codes.csv +262 -0
- data/data/codesxref.csv +280 -0
- data/data/comparisons.csv +75 -0
- data/lib/factbook-readers.rb +59 -0
- data/lib/factbook-readers/attributes.rb +74 -0
- data/lib/factbook-readers/builder.rb +212 -0
- data/lib/factbook-readers/builder_item.rb +185 -0
- data/lib/factbook-readers/builder_json.rb +79 -0
- data/lib/factbook-readers/codes.rb +122 -0
- data/lib/factbook-readers/comparisons.rb +50 -0
- data/lib/factbook-readers/counter.rb +48 -0
- data/lib/factbook-readers/normalize.rb +43 -0
- data/lib/factbook-readers/page.rb +148 -0
- data/lib/factbook-readers/page_info.rb +12 -0
- data/lib/factbook-readers/reader_json.rb +51 -0
- data/lib/factbook-readers/sanitizer.rb +307 -0
- data/lib/factbook-readers/sect.rb +29 -0
- data/lib/factbook-readers/subsect.rb +18 -0
- data/lib/factbook-readers/table.rb +52 -0
- data/lib/factbook-readers/utils.rb +47 -0
- data/lib/factbook-readers/utils_info.rb +129 -0
- data/lib/factbook-readers/version.rb +24 -0
- data/lib/factbook/readers.rb +5 -0
- data/test/data/au.html +579 -0
- data/test/data/au.yml +8 -0
- data/test/data/be.html +596 -0
- data/test/data/be.yml +8 -0
- data/test/data/json/au.json +892 -0
- data/test/data/src/ag.html +716 -0
- data/test/data/src/au-2015-09-24.html +2006 -0
- data/test/data/src/au.html +658 -0
- data/test/data/src/be-2015-09-24.html +2011 -0
- data/test/data/src/be.html +648 -0
- data/test/helper.rb +11 -0
- data/test/test_attribs.rb +87 -0
- data/test/test_attribs_def.rb +20 -0
- data/test/test_builder.rb +35 -0
- data/test/test_codes.rb +76 -0
- data/test/test_comparisons.rb +19 -0
- data/test/test_convert.rb +30 -0
- data/test/test_counter.rb +31 -0
- data/test/test_fields.rb +52 -0
- data/test/test_importer.rb +56 -0
- data/test/test_item_builder.rb +99 -0
- data/test/test_json.rb +45 -0
- data/test/test_json_builder.rb +25 -0
- data/test/test_normalize.rb +23 -0
- data/test/test_page.rb +38 -0
- data/test/test_sanitizer.rb +39 -0
- data/test/test_sanitizer_regex.rb +89 -0
- metadata +196 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook

  ######
  # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
  #
  #  expects a json object of the form
  #    { "section title": { "subsection title": data, ... }, ... }
  #  and turns every top-level entry into a Sect holding one Subsect per nested entry.
  class JsonBuilder
    include LogUtils::Logging
    include NormalizeHelper    ## e.g. normalize_category

    ## Builds from the json document stored in the file at path.
    ##
    ##  note: the file gets read as utf-8 (independent of the default external
    ##    encoding of the platform) and a leading byte order mark (bom), if present,
    ##    gets stripped - otherwise the bom would end up as the first character
    ##    of the text handed to the json parser
    def self.from_file( path )
      text = File.open( path, 'r:bom|utf-8' ) { |file| file.read }
      self.from_string( text )
    end

    ## Builds from json passed in as a string / text (NOT yet parsed).
    def self.from_string( text )
      self.new( text )
    end


    attr_reader :text,     ## json source text as passed in
                :json,     ## parsed json (result of JSON.parse)
                :info,     ## not used yet -- page info incl. country_name, region_name, last_updated etc.
                :errors,   ## not used yet -- encoding errors etc.
                :sects     ## array of Sect objects (one per top-level json entry)


    ## Parses text and builds the sects (with subsects).
    ##  raises whatever JSON.parse raises if text is not valid json.
    def initialize( text )
      @text = text
      @json = JSON.parse( text )

      @info   = nil   ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
      @errors = []    ## fix/todo: sorry - for now no errors possible/tracked

      @sects = @json.map do |sect_title, sect_subsects|
        sect          = Sect.new
        sect.title    = sect_title
        sect.subsects = sect_subsects.map do |subsect_title, subsect_data|
          subsect       = Subsect.new
          subsect.title = subsect_title
          subsect.data  = normalize_subsect_data( subsect_data )
          subsect
        end
        sect
      end
    end

  private

    ## Runs the keys of a data hash through normalize_category (again);
    ##  anything that is not a hash gets passed through as is.
    def normalize_subsect_data( data )
      return data unless data.is_a?( Hash )

      data.each_with_object( {} ) do |(key, value), normalized|
        normalized[ normalize_category( key ) ] = value
      end
    end

  end # class JsonBuilder

end # module Factbook
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
##
|
4
|
+
# note:
|
5
|
+
# the factbook category/region for world is other entities (on FAQ) and oceans in page
|
6
|
+
# changed to world
|
7
|
+
|
8
|
+
|
9
|
+
module Factbook

  ## List of factbook codes (records with code, name, category and region)
  ##  with filter helpers that return (again) a Codes object for easy-chaining e.g.
  ##    codes.countries.europe.to_a
  class Codes

    Code = Struct.new( :code,     ## todo: add notes (country affiliation) - why? why not??
                       :name,
                       :category, ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
                       :region    ## e.g. Europe, Oceans, etc.
                     )

    ## Reads the records from the csv file at path (via CsvHash.read);
    ##  requires the columns Code and Name - Category and Region are optional.
    ##
    ## note:
    ##  if you use quotes - NO leading spaces allowed e.g.
    ##    use au,"Austria",...  and NOT
    ##        au, "Austria", ...
    ##
    ##  for headers - NO leading spaces allowed e.g.
    ##    use Code,Name,Category,Region,...  and NOT
    ##        Code, Name, Category, Region, ...
    ##
    ## note: no longer dumps all rows and records to stdout (was left-over debug output)
    def self.from_csv( path )
      rows = CsvHash.read( path )

      recs = rows.map do |row|
        rec = Code.new
        rec.code = row['Code'].strip    ## remove leading n trailing whitespaces
        rec.name = row['Name'].strip

        ## note: for now category and region are optional
        rec.category = row['Category'].strip   if row['Category'] && row['Category'].size > 0
        rec.region   = row['Region'].strip     if row['Region']   && row['Region'].size > 0
        rec
      end

      self.new( recs )
    end

    ## codes - array of Code records
    def initialize( codes )
      @codes = codes
    end

    def size() @codes.size; end

    def each( &blk ) @codes.each( &blk ); end

    def select( &blk )
      codes = @codes.select( &blk )
      Codes.new( codes )   ## return (again) new Codes obj for easy-chaining - why? why not?
    end


    def to_a
      @codes.collect {|code| code.code }   ## return array of codes
    end

    ## def all() self.to_a; end    ## note: alias for to_a - use - why? why not??

    ## "pre-defined" convenience shortcuts
    def countries()        category 'Countries'; end
    def world()            category 'World'; end
    def oceans()           category 'Oceans'; end
    def misc()             category 'Miscellaneous'; end
    def others()           category 'Other'; end
    def dependencies()     category 'Dependencies'; end
    def dependencies_us()  category 'Dependencies (United States)'; end
    ## fix/todo: add all dependencies uk (or gb?), fr,cn,au,nz,no,dk,etc.

    def europe()                       region 'Europe'; end
    def south_asia()                   region 'South Asia'; end
    def central_asia()                 region 'Central Asia'; end
    def east_n_souteast_asia()         region 'East & Southeast Asia'; end
    def middle_east()                  region 'Middle East'; end
    def africa()                       region 'Africa'; end
    def north_america()                region 'North America'; end
    def central_america_n_caribbean()  region 'Central America and Caribbean'; end
    def south_america()                region 'South America'; end
    def australia_oceania()            region 'Australia-Oceania'; end
    def antartica()                    region 'Antarctica'; end

    ## note: the two shortcuts above are misspelt (souteast, antartica);
    ##   keep them for backwards compatibility and add the proper spellings
    alias_method :east_n_southeast_asia, :east_n_souteast_asia
    alias_method :antarctica,            :antartica

    ## note: regions oceans and world - same as category oceans and world
    ##   use oceans_ii or world_ii or something ??
    ##   use category('World') n region('World')
    ##   use category('Oceans') n region('Oceans')


    ## Returns a new Codes obj with all records whose category contains query
    ##  (plain string, case-insensitive); records without a category never match.
    def category( query )
      filter_by( :category, query )
    end

    ## Returns a new Codes obj with all records whose region contains query
    ##  (plain string, case-insensitive); records without a region never match.
    def region( query )
      filter_by( :region, query )
    end

  private

    ## shared filter for category and region
    def filter_by( attribute, query )
      ## todo/future: allow passing in of regex too (not just string)
      ## note: e.g. Dependencies (France) needs to get escaped to
      ##            Dependencies \(France\) etc.
      filter_regex = /#{Regexp.escape(query)}/i
      codes = @codes.select do |code|
        value = code[ attribute ]
        value ? filter_regex.match( value ) : false   ## note: allow nil; will fail on search
      end
      Codes.new( codes )   ## return new Codes obj for easy-chaining
    end

  end # class Codes

end # module Factbook
|
122
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook

  ## List of comparison records (num, category and name).
  class Comparisons

    Comparison = Struct.new( :num,      ### todo: use no or id or something - why? why not?
                             :category, ## e.g. Geography, People, Economy, etc.
                             :name
                           )

    ## Reads the records from the csv file at path (via CsvHash.read);
    ##  requires the columns Num, Category and Name.
    ##
    ## note: no longer dumps all rows and records to stdout (was left-over debug output)
    def self.from_csv( path )
      rows = CsvHash.read( path )

      recs = rows.map do |row|
        rec = Comparison.new
        rec.num      = row['Num'].strip.to_i    ## remove leading n trailing whitespaces
        rec.category = row['Category'].strip
        rec.name     = row['Name'].strip
        rec
      end

      self.new( recs )
    end

    ## comps - array of Comparison records
    def initialize( comps )
      @comps = comps
    end

    def size() @comps.size; end

    ## Yields every comparison record;
    ##   note: returns an enumerator if called without a block
    ##          (was: raised a LocalJumpError for a non-empty list)
    def each( &blk )
      @comps.each( &blk )
    end

    def to_a
      @comps.collect {|comp| comp.num }   ## return array of nums -- return something else - why? why not?
    end

  end # class Comparisons

end # module Factbook
|
50
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook

  ## Tallies how often every (nested) hash key shows up across pages.
  ##
  ##  data mirrors the nesting of the page data hashes; every node holds
  ##    :count - number of pages counted with this key (holding a hash) and
  ##    :codes - space-separated country codes of those pages
  ##               (gets dropped once the count passes 9)
  ##  next to the entries for its nested keys.
  class Counter

    attr_reader :data

    def initialize
      @data = {}
    end

    ## Adds the page to the tally; reads page.data and page.info.country_code.
    def count( page )
      walk( page, page.data, @data )
    end


  private

    ## walk the page data hash (hin) and add / update the matching nodes in hout;
    ##   note: only values that are hashes get counted (and walked)
    def walk( page, hin, hout )
      hin.each do |key, value|
        next unless value.is_a?( Hash )

        node = hout[key] || { count: 0, codes: '' }
        node[ :count ] += 1

        ## drop codes if larger (threshold) than x (e.g. 9)
        node.delete( :codes )   if node[ :count ] > 9

        seen = node[ :codes ]
        if seen     ## note: might be gone if passed threshold (e.g. 9 entries)
          seen << ' '   unless seen.empty?    ## add separator (space for now)
          seen << page.info.country_code
        end

        hout[key] = node
        walk( page, value, node )
      end
    end

  end # class Counter

end # module Factbook
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
  module NormalizeHelper

    ## Cleans up a category / field name:
    ##   strips leading and trailing whitespace and trailing colons
    ##   (one or more e.g. note: or note::) and maps some known typos
    ##   and variants to a single spelling; returns the cleaned-up name.
    def normalize_category( text )
      name = text.strip.sub( /:+\z/, '' ).strip

      ## border countries (8): -- remove (x) counter
      return 'border countries'   if name.start_with?( 'border countries' )

      case name
      ## typos
      when 'ntoe'                           then 'note'
      when 'investment if fixed capital'    then 'investment in fixed capital'
      ## downcase
      when 'Lowest point'                   then 'lowest point'
      when 'Chief of state'                 then 'chief of state'
      ## spelling variant (use more popular one)
      when 'signed but not ratified'        then 'signed, but not ratified'
      when 'vectorborne disease'            then 'vectorborne diseases'
      when 'water contact diseases'         then 'water contact disease'
      when 'food or waterborne disease'     then 'food or waterborne diseases'
      when 'geographical coordinates'       then 'geographic coordinates'
      when 'notes'                          then 'note'
      when 'refugees (countries of origin)' then 'refugees (country of origin)'
      else name
      end
    end

  end # module NormalizeHelper
end # module Factbook
|
@@ -0,0 +1,148 @@
|
|
1
|
+
|
2
|
+
module Factbook

  ## note:
  ##   some factbook pages with chrome (headers, footers, etc.)
  ##   are NOT valid utf-8, thus,
  ##   treat page as is (e.g. ASCII8BIT)
  ##
  ##   only convert to utf8 when header and footer got stripped
  ##
  ##  e.g. be/benin:
  ##    Key Force or FC [Lazare S?xx?HOU?xx?TO]  -- two invalid byte code chars in Political parties and leaders:
  ##
  ##    in Western/Windows-1252 leads to  FC [Lazare SÈHOUÉTO];
  ##      Lazare Sèhouéto
  ##
  ##    looks good - use (assume) Windows-1252 ????
  ##
  ##  todo: check if page is ascii 7-bit ??? if yes - no worries
  ##    if not, log number of chars not using ascii 7-bit


  ## Factbook page for one code - built from json (opts[:json]),
  ##  from a passed-in html page (opts[:html]) or
  ##  from the html page read (via Webcache) from the SITE_BASE url.
  class Page
    include LogUtils::Logging

    attr_reader :sects    ## "structured" access e.g. sects/subsects/etc.
    attr_reader :info     ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
    attr_reader :data     ## "plain" access with vanilla hash


    ## standard version  (note: requires https)
    SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'

    ## code - gets filled into the SITE_BASE url (used only without opts[:json] and opts[:html])
    ## opts - :json  json as string/text (NOT yet parsed to structured data)
    ##        :html  "custom" passed-in html page (for debugging and testing)
    ##        :info  lets you overwrite the page info
    def initialize( code, opts={} )
      ### keep code - why? why not??  (use page_info/info e.g. info.country_code??)

      builder = if opts[:json]
                  JsonBuilder.from_string( opts[:json] )
                else    ## assume html -- note: expects ASCII-7BIT/BINARY encoding
                  ## was: fetch_page( url_string )  ## use PageFetcher class - why?? why not??
                  html = opts[:html] || Webcache.read( SITE_BASE.gsub( '{code}', code ) )
                  Builder.from_string( html )
                end

      @sects = builder.sects
      @info  = builder.info

      ## todo/fix/quick hack:
      ##   check for info opts hash entry - lets you overwrite page info
      ##   -- use proper header to setup page info - why, why not??
      @info = opts[:info]   if opts[:info]

      @data = {}
      @sects.each { |sect| @data[ sect.title ] = sect.data }

      self  ## return self (check - not needed??)
    end


    ## convenience helper for data.to_json; note: pretty print by default!
    ##   pass in minify: true for the compact version
    def to_json( opts={} )
      ## was: -- opts[:pretty] || opts[:pp]
      opts[:minify] ? data.to_json : JSON.pretty_generate( data )
    end


    ### convenience shortcut - lets you use
    ###    page['geo']
    ###  instead of
    ###    page.data['geo']
    def [](key)
      ## fix: use delegate data, [] from forwardable lib - why?? why not??
      data[key]
    end

    ## add convenience (shortcut) accessors / attributes / fields / getters
    ##  e.g.
    ##    def background()  data['Introduction']['Background']['text']; end
    ##    def location()    data['Geography']['Location']['text']; end
    ##    etc.
    ATTRIBUTES.each do |attrib|
      getter = attrib.name.to_sym

      if attrib.path.size == 1
        define_method( getter ) do
          category = @data.fetch( attrib.category, {} )
          category.fetch( attrib.path[0], {} )['text']
        end
      else  ## assume size 2 for now
        define_method( getter ) do
          category = @data.fetch( attrib.category, {} )
          field    = category.fetch( attrib.path[0], {} )
          field.fetch( attrib.path[1], {} )['text']
        end
      end
    end


  private

    ## get the page at url via Webget and return the text of the response
    def fetch_page( url )
      response = Webget.page( url )

      ## note: exit on get / fetch error - do NOT continue for now - why? why not?
      exit 1   if response.status.nok?    ## e.g.  HTTP status code != 200

      response.text
    end


    ## (commented out - kept for reference)
    ##
    ## def self.from_url( cc, cn )
    ##   html_ascii = PageFetcher.new.fetch( cc )
    ##   self.new( cc, cn, html_ascii )
    ## end
    ##
    ## def self.from_file( cc, cn, opts={} )
    ##   input_dir = opts[:input_dir] || '.'
    ##   html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
    ##   self.new( cc, cn, html_ascii )
    ## end

  end # class Page
end # module Factbook
|