factbook 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +0 -61
- data/README.md +8 -506
- data/Rakefile +4 -9
- data/lib/factbook.rb +4 -64
- metadata +6 -124
- data/data/attributes.yml +0 -337
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook/almanac.rb +0 -72
- data/lib/factbook/attributes.rb +0 -74
- data/lib/factbook/builder.rb +0 -212
- data/lib/factbook/builder_item.rb +0 -126
- data/lib/factbook/builder_json.rb +0 -79
- data/lib/factbook/codes.rb +0 -119
- data/lib/factbook/comparisons.rb +0 -50
- data/lib/factbook/counter.rb +0 -48
- data/lib/factbook/db/importer.rb +0 -92
- data/lib/factbook/db/models.rb +0 -11
- data/lib/factbook/db/schema.rb +0 -36
- data/lib/factbook/normalize.rb +0 -43
- data/lib/factbook/page.rb +0 -148
- data/lib/factbook/page_info.rb +0 -12
- data/lib/factbook/reader_json.rb +0 -51
- data/lib/factbook/sanitizer.rb +0 -178
- data/lib/factbook/sect.rb +0 -29
- data/lib/factbook/subsect.rb +0 -18
- data/lib/factbook/table.rb +0 -52
- data/lib/factbook/utils.rb +0 -85
- data/lib/factbook/utils_info.rb +0 -129
- data/lib/factbook/version.rb +0 -21
- data/script/almanac.rb +0 -48
- data/script/attributes.rb +0 -34
- data/script/build.rb +0 -28
- data/script/counter.rb +0 -145
- data/script/json.rb +0 -19
- data/script/testbr.rb +0 -33
- data/script/testcodes.rb +0 -11
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/helper.rb +0 -11
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_builder.rb +0 -35
- data/test/test_codes.rb +0 -76
- data/test/test_comparisons.rb +0 -19
- data/test/test_convert.rb +0 -30
- data/test/test_counter.rb +0 -31
- data/test/test_fields.rb +0 -52
- data/test/test_importer.rb +0 -56
- data/test/test_item_builder.rb +0 -99
- data/test/test_json.rb +0 -45
- data/test/test_json_builder.rb +0 -25
- data/test/test_normalize.rb +0 -23
- data/test/test_page.rb +0 -38
- data/test/test_sanitizer.rb +0 -39
- data/test/test_sanitizer_regex.rb +0 -89
@@ -1,79 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook

######
# json builder -- lets us rebuild a page from "dumped" json (instead of parsing the html page)
#
#  the json is walked as a two-level hash:
#    { section title => { subsection title => data } }

class JsonBuilder
  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category

  ## Reads the json dump stored in path and builds from its text.
  ##   note: the file always gets decoded as utf-8 (a leading byte order mark
  ##         gets skipped) - independent of the default external encoding
  def self.from_file( path )
    text = File.read( path, mode: 'r:bom|utf-8' )
    from_string( text )
  end

  ## Builds from json text already in memory.
  def self.from_string( text )
    new( text )
  end


  attr_reader :text,
              :json,
              :info,       ## not used yet -- page info incl. country_name, region_name, last_updated etc.
              :errors,     ## not used yet -- encoding errors etc.
              :sects

  ## Parses text as json (JSON.parse raises on invalid json) and
  ## converts every top-level entry into a Sect with its Subsects.
  def initialize( text )
    @text = text

    @json = JSON.parse( text )

    @info   = nil    ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
    @errors = []     ## fix/todo: sorry - for now no errors possible/tracked

    @sects = @json.map do |sect_title, sect_subsects|
      build_sect( sect_title, sect_subsects )
    end
  end

private

  ## builds one section incl. all of its subsections
  def build_sect( title, subsects_data )
    sect = Sect.new
    sect.title    = title
    sect.subsects = subsects_data.map do |subsect_title, subsect_data|
      build_subsect( subsect_title, subsect_data )
    end
    sect
  end

  ## builds one subsection; hash data gets its keys normalized, all other data is passed through as is
  def build_subsect( title, data )
    subsect = Subsect.new
    subsect.title = title
    subsect.data  = data.is_a?( Hash ) ? normalize_keys( data ) : data
    subsect
  end

  ## note: run data hash through normalize_category (again);
  ##       if two keys normalize to the same category the last one wins
  def normalize_keys( data )
    data.each_with_object( {} ) do |(key, value), normalized|
      normalized[ normalize_category( key ) ] = value
    end
  end

end # class JsonBuilder


end # module Factbook
|
data/lib/factbook/codes.rb
DELETED
@@ -1,119 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
##
|
4
|
-
# note:
|
5
|
-
# the factbook category/region for world is other entities (on FAQ) and oceans in page
|
6
|
-
# changed to world
|
7
|
-
|
8
|
-
|
9
|
-
module Factbook

## Collection of factbook country/entity codes with filters by category and region.
##   all filters return a new Codes object for easy-chaining
##   e.g. codes.countries.europe.to_a
class Codes

  Code = Struct.new( :code,        ## todo: add notes (country affiliation) - why? why not??
                     :name,
                     :category,    ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
                     :region )     ## e.g. Europe, Oceans, etc.

  ## Builds a Codes collection from a csv file with a header row.
  ##   columns Code and Name are required (a missing value raises NoMethodError);
  ##   columns Category and Region are optional
  ##
  ## note:
  ##   if you use quotes - NO leading spaces allowed e.g.
  ##     use au,"Austria",...  and NOT
  ##         au, "Austria", ...
  ##
  ##   for headers - NO leading spaces allowed e.g.
  ##     use Code,Name,Category,Region,...  and NOT
  ##         Code, Name, Category, Region, ...
  def self.from_csv( path )
    rows = CSV.read( path, headers: true )

    ## note: no debug dumps (pp) of rows/records here - reading codes must not write to stdout
    recs = rows.map do |row|
      rec = Code.new
      rec.code = row['Code'].strip    ## remove leading n trailing whitespaces
      rec.name = row['Name'].strip

      ## note: for now category and region are optional
      rec.category = row['Category'].strip  if row['Category']
      rec.region   = row['Region'].strip    if row['Region']
      rec
    end

    new( recs )
  end

  ## codes is an array of Code records
  def initialize( codes )
    @codes = codes
  end

  def size() @codes.size; end

  ## yields every Code record
  def each
    @codes.each { |code| yield( code ) }
  end

  ## returns array of code strings (NOT the Code records)
  def to_a
    @codes.map { |code| code.code }
  end

  ##  def all()  self.to_a; end   ## note: alias for to_a - use - why? why not??

  ## "pre-defined" convenience shortcuts
  def countries()       category 'Countries'; end
  def world()           category 'World'; end
  def oceans()          category 'Oceans'; end
  def misc()            category 'Miscellaneous'; end
  def others()          category 'Other'; end
  def dependencies()    category 'Dependencies'; end
  def dependencies_us() category 'Dependencies (United States)'; end
  ## fix/todo: add all dependencies  uk (or gb?), fr,cn,au,nz,no,dk,etc.

  def europe()                      region 'Europe'; end
  def south_asia()                  region 'South Asia'; end
  def central_asia()                region 'Central Asia'; end
  def east_n_souteast_asia()        region 'East & Southeast Asia'; end
  def middle_east()                 region 'Middle East'; end
  def africa()                      region 'Africa'; end
  def north_america()               region 'North America'; end
  def central_america_n_caribbean() region 'Central America and Caribbean'; end
  def south_america()               region 'South America'; end
  def australia_oceania()           region 'Australia-Oceania'; end
  def antartica()                   region 'Antarctica'; end

  ## correctly spelled names; the misspelled originals stay for backward compatibility
  alias_method :east_n_southeast_asia, :east_n_souteast_asia
  alias_method :antarctica,            :antartica

  ## note: regions oceans and world - same as category oceans and world
  ##   use oceans_ii or world_ii or something ??
  ##   use category('World') n region('World')
  ##   use category('Oceans') n region('Oceans')


  ## Selects all codes whose category includes query (case-insensitive).
  def category( query )
    filter_by( :category, query )
  end

  ## Selects all codes whose region includes query (case-insensitive).
  def region( query )
    filter_by( :region, query )
  end

private

  ## shared filter for category and region
  ##  todo/future: allow passing in of regex too (not just string)
  def filter_by( field, query )
    ##  note: e.g. Dependencies (France) needs to get escaped to
    ##             Dependencies \(France\) etc.
    filter_regex = /#{Regexp.escape(query)}/i
    codes = @codes.select do |code|
      value = code[ field ]
      value ? filter_regex.match( value ) : false   ## note: allow nil for field; will fail on search
    end
    Codes.new( codes )   ## return new Codes obj for easy-chaining
  end

end  # class Codes

end # module Factbook
|
119
|
-
|
data/lib/factbook/comparisons.rb
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook

## Collection of factbook country comparison records (num, category, name).
class Comparisons

  Comparison = Struct.new( :num,        ### todo: use no or id or something - why? why not?
                           :category,   ## e.g. Geography, People, Economy, etc.
                           :name )

  ## Builds a Comparisons collection from a csv file with a header row.
  ##   all three columns Num, Category and Name are required
  ##   (a missing value raises NoMethodError)
  def self.from_csv( path )
    rows = CSV.read( path, headers: true )

    ## note: no debug dumps (pp) of rows/records here - reading must not write to stdout
    recs = rows.map do |row|
      rec = Comparison.new
      rec.num      = row['Num'].strip.to_i    ## remove leading n trailing whitespaces
      rec.category = row['Category'].strip
      rec.name     = row['Name'].strip
      rec
    end

    new( recs )
  end

  ## comps is an array of Comparison records
  def initialize( comps )
    @comps = comps
  end

  def size() @comps.size; end

  ## yields every Comparison record
  def each
    @comps.each { |comp| yield( comp ) }
  end

  ## returns array of nums  -- return something else - why? why not?
  def to_a
    @comps.map { |comp| comp.num }
  end

end  # class Comparisons

end # module Factbook
|
50
|
-
|
data/lib/factbook/counter.rb
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook

## Counts in how many pages every (nested) category shows up.
##
##  data is a nested hash; every node holds
##    :count  -- number of counted pages that include the category
##    :codes  -- space-separated country codes of those pages;
##               gets dropped for good once count passes max_codes
##  plus one entry (keyed by category name) for every nested category.
class Counter

  DEFAULT_MAX_CODES = 9

  attr_reader :data

  ## max_codes -- threshold; nodes counted more often no longer list their country codes
  def initialize( max_codes: DEFAULT_MAX_CODES )
    @data      = {}
    @max_codes = max_codes
  end

  ## Adds all hash nodes of page.data to the running counts;
  ## uses page.info.country_code for the list of codes.
  def count( page )
    ## walk page data hash
    #   add nodes to data
    walk( page, page.data, @data )
  end


private
  def walk( page, hin, hout )
    hin.each do |key, value|
      next unless value.is_a?( Hash )   ## note: only hashes are categories; skip leaf values

      ## note: start with an explicit (unfrozen) copy - codes get appended in-place
      node = ( hout[key] ||= { count: 0, codes: ''.dup } )
      node[ :count ] += 1

      if node[ :count ] > @max_codes
        ## delete codes if larger (threshold) than x (e.g. 9)
        node.delete( :codes )
      elsif (codes = node[ :codes ])   ## note: might have been deleted if passed threshold
        codes << ' '  unless codes.empty?   ## add separator (space for now)
        codes << page.info.country_code
      end

      walk( page, value, node )
    end
  end

end # class Counter

end # module Factbook
|
data/lib/factbook/db/importer.rb
DELETED
@@ -1,92 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook

## Imports selected facts of a page into the facts table (one record per country code).
class Importer

  ## Creates or updates the Fact record for page.info.country_code.
  ##   raises if the record cannot be saved
  def import( page )

    ## note: assumes active connection

    code = page.info.country_code
    name = page.info.country_name

    attribs = {
      name:  name,
      area:       sq_km( page.area ),         # e.g. 83,871 sq km
      area_land:  sq_km( page.area_land ),    # e.g. 82,445 sq km
      area_water: sq_km( page.area_water ),   # e.g. 1,426 sq km

      population:        num( page.population ),                  # e.g. 8,665,550 (July 2015 est.)
      population_growth: percent( page.population_growth ),       # e.g. 0.55% (2015 est.)
      birth_rate:        rate_per_thousand( page.birth_rate ),    # e.g. 9.41 births/1,000 population (2015 est.)
      death_rate:        rate_per_thousand( page.death_rate ),    # e.g. 9.42 deaths/1,000 population (2015 est.)
      migration_rate:    rate_per_thousand( page.migration_rate ),  # e.g. 5.56 migrant(s)/1,000 population (2015 est.)
    }

    rec = Fact.find_by( code: code )
    if rec.nil?    ## create (new) record
      rec = Fact.new
      attribs[ :code ] = code
      puts "create fact record #{code}/#{name}:"
    else           ## update (existing) record
      puts "update fact record #{code}/#{name}:"
    end

    puts " #{attribs.inspect}"
    ## note: use update! - update_attributes! got deprecated in rails 6.0 and removed in 6.1
    rec.update!( attribs )
  end


  ## Returns the rate as float or nil (plus a warning on stdout) if the format is unknown.
  def rate_per_thousand( text )
    # e.g.  9.41 births/1,000 population (2015 est.)
    #       9.42 deaths/1,000 population (2015 est.)
    #       5.56 migrant(s)/1,000 population (2015 est.)
    #      -2.12 migrant(s)/1,000 population (2015 est.)

    ## note: keep a leading minus (e.g. net emigration);
    ##       a dash following a digit (as in a range 1-2) is NOT a sign
    if text =~ /(?<![0-9])(-?[0-9.]+) [a-z()]+\/1,000/
      Regexp.last_match( 1 ).to_f
    else
      puts "*** warn: unknown rate <name>/1,000 format (no match): >#{text}<"
      nil
    end
  end

  ## Returns the first number in text as integer or nil (plus a warning on stdout) if there is none.
  def num( text )
    # e.g. 8,665,550 (July 2015 est.)

    if text =~ /([0-9,.]+)/
      Regexp.last_match( 1 ).gsub( ',', '' ).to_i   ## note: remove commas (,) if present
    else
      puts "*** warn: unknown number format (no match): >#{text}<"
      nil   ## return nil
    end
  end

  ## Returns the percentage as float or nil (plus a warning on stdout) if the format is unknown.
  def percent( text )
    # e.g.  0.55% (2015 est.)
    #      -0.17% (2015 est.)

    ## note: keep a leading minus (e.g. shrinking population);
    ##       a dash following a digit (as in a range 1-2%) is NOT a sign
    if text =~ /(?<![0-9])(-?[0-9.]+)%/
      Regexp.last_match( 1 ).to_f
    else
      puts "*** warn: unknown percent format (no match): >#{text}<"
      nil   ## return nil
    end
  end

  ## Returns the area in square kilometers as integer or nil (plus a warning on stdout)
  ## if the format is unknown.
  def sq_km( text )
    # e.g. 83,871 sq km
    ## todo - check vatican - uses float e.g. 0.44 ?? add support?
    ##   note: for now fractions get cut off (0.44 => 0)

    if text =~ /([0-9,.]+) sq km/
      Regexp.last_match( 1 ).gsub( ',', '' ).to_i   ## note: remove commas (,) if present
    else
      puts "*** warn: unknown sq km format (no match): >#{text}<"
      nil   ## return nil
    end
  end


end # class Importer

end # module Factbook
|
92
|
-
|
data/lib/factbook/db/models.rb
DELETED
data/lib/factbook/db/schema.rb
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook

## Sets up the database schema: a single facts table with one row per country.
class CreateDb

  ## Creates the facts table (via ActiveRecord::Schema.define).
  def up
    ActiveRecord::Schema.define do

      create_table :facts do |t|
        ## required identification columns
        t.column :code, :string, null: false     # country code e.g. au
        t.column :name, :string, null: false     # country name e.g. Austria

        ## whole-number facts
        t.column :area,       :integer           # e.g. 83,871 sq km
        t.column :area_land,  :integer           # e.g. 82,445 sq km   --use float - why? why not?
        t.column :area_water, :integer           # e.g. 1,426 sq km
        t.column :population, :integer           # e.g. 8,665,550 (July 2015 est.)

        ## rates and percentages
        t.column :population_growth, :float      # e.g. 0.55% (2015 est.)
        t.column :birth_rate,        :float      # e.g. 9.41 births/1,000 population (2015 est.)
        t.column :death_rate,        :float      # e.g. 9.42 deaths/1,000 population (2015 est.)
        t.column :migration_rate,    :float      # e.g. 5.56 migrant(s)/1,000 population (2015 est.)

        t.timestamps
      end

    end # block Schema.define
  end # method up

end # class CreateDb

end # module Factbook
|
data/lib/factbook/normalize.rb
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
module NormalizeHelper

  ## Returns the canonical name for a category (field) title:
  ##   - strips surrounding whitespace and trailing colons
  ##     (one or more e.g. note: or note::)
  ##   - maps known typos, capitalized names and spelling variants
  ##     to the more popular spelling
  ##   - cuts off everything after border countries e.g. the (8) counter
  ##  all other titles get returned as is (stripped)
  def normalize_category( text )
    title = text.strip.sub( /:+\z/, '' ).strip

    ## border countries (8): -- remove (x) counter
    return 'border countries'  if title.start_with?( 'border countries' )

    case title
    ## typos
    when 'ntoe'                           then 'note'
    when 'investment if fixed capital'    then 'investment in fixed capital'
    ## downcase
    when 'Lowest point'                   then 'lowest point'
    when 'Chief of state'                 then 'chief of state'
    ## spelling variant (use more popular one)
    when 'signed but not ratified'        then 'signed, but not ratified'
    when 'vectorborne disease'            then 'vectorborne diseases'
    when 'water contact diseases'         then 'water contact disease'
    when 'food or waterborne disease'     then 'food or waterborne diseases'
    when 'geographical coordinates'       then 'geographic coordinates'
    when 'notes'                          then 'note'
    when 'refugees (countries of origin)' then 'refugees (country of origin)'
    else title
    end
  end


end # module NormalizeHelper
end # module Factbook
|