factbook 2.0.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +0 -61
- data/README.md +8 -506
- data/Rakefile +4 -9
- data/lib/factbook.rb +4 -64
- metadata +6 -124
- data/data/attributes.yml +0 -337
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook/almanac.rb +0 -72
- data/lib/factbook/attributes.rb +0 -74
- data/lib/factbook/builder.rb +0 -212
- data/lib/factbook/builder_item.rb +0 -126
- data/lib/factbook/builder_json.rb +0 -79
- data/lib/factbook/codes.rb +0 -119
- data/lib/factbook/comparisons.rb +0 -50
- data/lib/factbook/counter.rb +0 -48
- data/lib/factbook/db/importer.rb +0 -92
- data/lib/factbook/db/models.rb +0 -11
- data/lib/factbook/db/schema.rb +0 -36
- data/lib/factbook/normalize.rb +0 -43
- data/lib/factbook/page.rb +0 -148
- data/lib/factbook/page_info.rb +0 -12
- data/lib/factbook/reader_json.rb +0 -51
- data/lib/factbook/sanitizer.rb +0 -178
- data/lib/factbook/sect.rb +0 -29
- data/lib/factbook/subsect.rb +0 -18
- data/lib/factbook/table.rb +0 -52
- data/lib/factbook/utils.rb +0 -85
- data/lib/factbook/utils_info.rb +0 -129
- data/lib/factbook/version.rb +0 -21
- data/script/almanac.rb +0 -48
- data/script/attributes.rb +0 -34
- data/script/build.rb +0 -28
- data/script/counter.rb +0 -145
- data/script/json.rb +0 -19
- data/script/testbr.rb +0 -33
- data/script/testcodes.rb +0 -11
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/helper.rb +0 -11
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_builder.rb +0 -35
- data/test/test_codes.rb +0 -76
- data/test/test_comparisons.rb +0 -19
- data/test/test_convert.rb +0 -30
- data/test/test_counter.rb +0 -31
- data/test/test_fields.rb +0 -52
- data/test/test_importer.rb +0 -56
- data/test/test_item_builder.rb +0 -99
- data/test/test_json.rb +0 -45
- data/test/test_json_builder.rb +0 -25
- data/test/test_normalize.rb +0 -23
- data/test/test_page.rb +0 -38
- data/test/test_sanitizer.rb +0 -39
- data/test/test_sanitizer_regex.rb +0 -89
@@ -1,79 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook

######
# json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
#
#  expected json layout (as walked by the builder):
#    { section title => { subsection title => subsection data, ... }, ... }

class JsonBuilder
  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category


  ## build from a json file on disk
  ##   note: always read as utf-8 and skip a leading byte order mark (bom) if present
  ##         (instead of relying on the platform default encoding)
  def self.from_file( path )
    text = File.open( path, 'r:bom|utf-8' ) { |file| file.read }
    from_string( text )
  end

  ## build from a json string
  def self.from_string( text )
    new( text )
  end


  attr_reader :text,     ## the raw json text passed in
              :json,     ## the parsed json
              :info,     ## not used yet -- page info incl. country_name, region_name, last_updated etc.
              :errors,   ## not used yet -- encoding errors etc.
              :sects     ## array of sections (Sect) rebuilt from the json


  ## parse the json text and rebuild all sections; raises if the text is not valid json
  def initialize( text )
    @text = text

    @json = JSON.parse( text )

    @info   = nil   ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
    @errors = []    ## fix/todo: sorry - for now no errors possible/tracked

    @sects = @json.map do |sect_title, sect_subsects|
      build_sect( sect_title, sect_subsects )
    end
  end


private

  ## build one section (title plus all of its subsections)
  def build_sect( title, subsects_data )
    sect = Sect.new
    sect.title    = title
    sect.subsects = subsects_data.map do |subsect_title, subsect_data|
      build_subsect( subsect_title, subsect_data )
    end
    sect
  end

  ## build one subsection; hash data gets its keys normalized, any other data is kept as is
  def build_subsect( title, data )
    subsect = Subsect.new
    subsect.title = title
    subsect.data  = data.is_a?( Hash ) ? normalize_keys( data ) : data
    subsect
  end

  ## note: run data hash keys through normalize_category (again)
  def normalize_keys( data )
    data.each_with_object( {} ) do |(key, value), normalized|
      normalized[ normalize_category( key ) ] = value
    end
  end

end # class JsonBuilder


end # module Factbook
|
data/lib/factbook/codes.rb
DELETED
@@ -1,119 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
##
|
4
|
-
# note:
|
5
|
-
# the factbook category/region for world is other entities (on FAQ) and oceans in page
|
6
|
-
# changed to world
|
7
|
-
|
8
|
-
|
9
|
-
module Factbook

## list of factbook codes with filters by category and region;
##  every filter returns a new Codes object for easy-chaining
class Codes

  Code = Struct.new( :code,     ## todo: add notes (country affiliation) - why? why not??
                     :name,
                     :category, ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
                     :region    ## e.g. Europe, Oceans, etc.
                   )

  ## read codes from a csv file with a header row (Code,Name,Category,Region)
  def self.from_csv( path )
    ###
    #  note:
    #   if you use quotes - NO leading spaces allowed e.g.
    #     use au,"Austria",...  and NOT
    #         au, "Austria", ...
    #
    #   for headers - NO leading spaces allowed e.g.
    #     use Code,Name,Category,Region,...  and NOT
    #         Code, Name, Category, Region, ...

    rows = CSV.read( path, headers: true )

    ## note: no more debug dumps (pp) of every row - loading is silent now
    recs = rows.map do |row|
      rec = Code.new
      rec.code = row['Code'].strip    ## remove leading n trailing whitespaces
      rec.name = row['Name'].strip

      ## note: for now category and region are optional
      rec.category = row['Category'].strip   if row['Category']
      rec.region   = row['Region'].strip     if row['Region']
      rec
    end

    new( recs )
  end

  ## codes - array of records responding to code, category and region
  def initialize( codes )
    @codes = codes
  end

  def size() @codes.size; end

  ## yield every record; returns an enumerator if no block is given
  def each
    return enum_for( :each ) { size }   unless block_given?

    @codes.each {|code| yield( code ) }
  end

  def to_a
    @codes.map {|code| code.code }   ## return array of codes
  end

  ##  def all() self.to_a; end   ## note: alias for to_a - use - why? why not??

  ## "pre-defined" convenience shortcuts
  def countries()       category 'Countries';     end
  def world()           category 'World';         end
  def oceans()          category 'Oceans';        end
  def misc()            category 'Miscellaneous'; end
  def others()          category 'Other';         end
  def dependencies()    category 'Dependencies';  end
  def dependencies_us() category 'Dependencies (United States)';  end
  ## fix/todo: add all dependencies uk (or gb?), fr,cn,au,nz,no,dk,etc.

  def europe()                       region 'Europe';        end
  def south_asia()                   region 'South Asia';    end
  def central_asia()                 region 'Central Asia';  end
  def east_n_souteast_asia()         region 'East & Southeast Asia';  end
  def middle_east()                  region 'Middle East';   end
  def africa()                       region 'Africa';        end
  def north_america()                region 'North America'; end
  def central_america_n_caribbean()  region 'Central America and Caribbean'; end
  def south_america()                region 'South America'; end
  def australia_oceania()            region 'Australia-Oceania'; end
  def antartica()                    region 'Antarctica';    end

  ## note: the two shortcuts above carry typos in their names;
  ##   keep them (for existing callers) and add the correct spelling as an alias
  alias_method :east_n_southeast_asia, :east_n_souteast_asia
  alias_method :antarctica,            :antartica

  ## note: regions oceans and world - same as category oceans and world
  ##      use oceans_ii or world_ii or something ??
  ##  use category('World') n region('World')
  ##  use category('Oceans') n region('Oceans')


  ## select codes by category; query is matched case-insensitive
  ##   anywhere in the category (e.g. Dependencies also matches Dependencies (France))
  def category( query )
    filter_by( :category, query )
  end

  ## select codes by region; query is matched case-insensitive anywhere in the region
  def region( query )
    filter_by( :region, query )
  end


private

  ## shared filter for category and region
  def filter_by( attribute, query )
    ## todo/future: allow passing in of regex too (not just string)
    ##   note: e.g. Dependencies (France) needs to get escaped to
    ##              Dependencies \(France\) etc.
    filter_regex = /#{Regexp.escape(query)}/i
    codes = @codes.select do |code|
      value = code.public_send( attribute )
      value ? filter_regex.match?( value ) : false   ## note: allow nil; will fail on search
    end
    Codes.new( codes )   ## return new Codes obj for easy-chaining
  end

end  # class Codes

end # module Factbook
|
119
|
-
|
data/lib/factbook/comparisons.rb
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook

## list of factbook country comparisons (num, category, name) read from a csv file
class Comparisons

  Comparison = Struct.new( :num,      ### todo: use no or id or something - why? why not?
                           :category, ## e.g. Geography, People, Economy, etc.
                           :name
                         )

  ## read comparisons from a csv file with a header row (Num,Category,Name)
  def self.from_csv( path )
    rows = CSV.read( path, headers: true )

    ## note: no more debug dumps (pp) of every row - loading is silent now
    recs = rows.map do |row|
      rec = Comparison.new
      rec.num      = row['Num'].strip.to_i    ## remove leading n trailing whitespaces
      rec.category = row['Category'].strip
      rec.name     = row['Name'].strip
      rec
    end

    new( recs )
  end

  ## comps - array of records responding to num
  def initialize( comps )
    @comps = comps
  end

  def size() @comps.size; end

  ## yield every record; returns an enumerator if no block is given
  def each
    return enum_for( :each ) { size }   unless block_given?

    @comps.each {|comp| yield( comp ) }
  end

  def to_a
    @comps.map {|comp| comp.num }   ## return array of nums -- return something else - why? why not?
  end

end  # class Comparisons

end # module Factbook
|
50
|
-
|
data/lib/factbook/counter.rb
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook

## counts how often every (nested) category shows up across pages
##
##  data is a nested hash mirroring the page data;
##   every hash node gets a :count and (up to the threshold) a :codes entry
##   listing the country codes of the pages seen (separated by space)
class Counter

  attr_reader :data

  ## codes_threshold - once a node got counted more often than this number
  ##                    its :codes list gets dropped (for good)
  def initialize( codes_threshold: 9 )
    @data            = {}
    @codes_threshold = codes_threshold
  end

  ## add the page to the counts
  ##   page must respond to data (nested hash) and info.country_code
  def count( page )

    ## walk page data hash
    #   add nodes to data

    walk( page, page.data, @data )
  end


private
  def walk( page, hin, hout )
    hin.each do |k,v|
      next unless v.is_a?( Hash )    ## note: only hash nodes get counted; skip leaves

      ## note: use a copy (dup) of the string literal for codes -
      ##         codes gets appended to in-place, and
      ##         literals might be frozen (or "chilled" in ruby 3.4+)
      hout2 = hout[k] || { count: 0, codes: ''.dup }

      hout2[ :count ] += 1

      ## delete codes if larger (threshold) than x (e.g. 9)
      hout2.delete( :codes )   if hout2[ :count ] > @codes_threshold

      codes = hout2[ :codes ]
      if codes   ## note: might got deleted if passed threshold (e.g. 9 entries)
        codes << ' '    unless codes.empty?   ## add separator (space for now)
        codes << page.info.country_code
        hout2[ :codes ] = codes
      end

      hout[k] = hout2
      walk( page, v, hout2 )
    end
  end

end # class Counter

end # module Factbook
|
data/lib/factbook/db/importer.rb
DELETED
@@ -1,92 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook

## imports some key facts (area, population, rates) of a page into the facts table
class Importer

  ## create or update the fact record for the page (looked up by country code)
  ##   page must respond to info (country_code, country_name) and
  ##    area, area_land, area_water, population, population_growth,
  ##    birth_rate, death_rate, migration_rate (all as text)
  def import( page )

    ## note: assumes active connection

    code = page.info.country_code
    name = page.info.country_name

    attribs = {
      name:       name,
      area:       sq_km( page.area ),        # e.g. 83,871 sq km
      area_land:  sq_km( page.area_land ),   # e.g. 82,445 sq km
      area_water: sq_km( page.area_water ),  # e.g. 1,426 sq km

      population:        num( page.population ),                  # e.g. 8,665,550 (July 2015 est.)
      population_growth: percent( page.population_growth ),       # e.g. 0.55% (2015 est.)
      birth_rate:        rate_per_thousand( page.birth_rate ),     # e.g. 9.41 births/1,000 population (2015 est.)
      death_rate:        rate_per_thousand( page.death_rate ),     # e.g. 9.42 deaths/1,000 population (2015 est.)
      migration_rate:    rate_per_thousand( page.migration_rate ), # e.g. 5.56 migrant(s)/1,000 population (2015 est.)
    }

    rec = Fact.find_by( code: code )
    if rec.nil?    ## create (new) record
      rec = Fact.new
      attribs[ :code ] = code
      puts "create fact record #{code}/#{name}:"
    else           ## update (existing) record
      puts "update fact record #{code}/#{name}:"
    end

    puts " #{attribs.inspect}"
    ## NOTE(review): update_attributes! got removed in activerecord 6.1 (use update!) -
    ##                 confirm the supported activerecord versions before switching
    rec.update_attributes!( attribs )
  end


  ## returns the rate as float or nil (plus a warning printed) if the format is unknown
  def rate_per_thousand( text )
    # e.g.  9.41 births/1,000 population (2015 est.)
    #       9.42 deaths/1,000 population (2015 est.)
    #       5.56 migrant(s)/1,000 population (2015 est.)
    #      -1.12 migrant(s)/1,000 population (2015 est.)

    ## note: allow optional leading minus (-) e.g. for net migration rates
    if text =~ %r{(-?[0-9.]+) [a-z()]+/1,000}
      Regexp.last_match( 1 ).to_f
    else
      puts "*** warn: unknown rate <name>/1,000 format (no match): >#{text}<"
      nil
    end
  end

  ## returns the (first) number as integer or nil (plus a warning printed) if no number found
  def num( text )
    # e.g. 8,665,550 (July 2015 est.)

    if text =~ /([0-9,.]+)/
      Regexp.last_match( 1 ).gsub( ',', '' ).to_i   ## note: remove commas (,) if present
    else
      puts "*** warn: unknown number format (no match): >#{text}<"
      nil   ## return nil
    end
  end

  ## returns the percent as float or nil (plus a warning printed) if the format is unknown
  def percent( text )
    # e.g. 0.55% (2015 est.)
    #     -0.55% (2015 est.)

    ## note: allow optional leading minus (-) e.g. for shrinking populations
    if text =~ /(-?[0-9.]+)%/
      Regexp.last_match( 1 ).to_f
    else
      puts "*** warn: unknown percent format (no match): >#{text}<"
      nil   ## return nil
    end
  end

  ## returns the square kilometers as integer or nil (plus a warning printed) if the format is unknown
  def sq_km( text )
    # e.g. 83,871 sq km
    ## todo - check vatican - uses float e.g. 0.44 ?? add support?
    ##   note: for now fractions get cut off (to_i) e.g. 0.44 sq km => 0

    if text =~ /([0-9,.]+) sq km/
      Regexp.last_match( 1 ).gsub( ',', '' ).to_i   ## note: remove commas (,) if present
    else
      puts "*** warn: unknown sq km format (no match): >#{text}<"
      nil   ## return nil
    end
  end


end # class Importer

end # module Factbook
|
92
|
-
|
data/lib/factbook/db/models.rb
DELETED
data/lib/factbook/db/schema.rb
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook

## schema for the facts table (one record per country/page)
class CreateDb

  ## create the facts table
  ##   note: uses ActiveRecord::Schema.define, thus, assumes an active connection
  def up
    ActiveRecord::Schema.define do
      create_table :facts do |t|
        ## country code e.g. au and country name e.g. Austria
        %i[code name].each { |column| t.string column, null: false }

        ## area, area_land, area_water e.g. 83,871 sq km   --use float - why? why not?
        ## population                  e.g. 8,665,550 (July 2015 est.)
        %i[area area_land area_water population].each { |column| t.integer column }

        ## population_growth e.g. 0.55% (2015 est.)
        ## birth_rate        e.g. 9.41 births/1,000 population (2015 est.)
        ## death_rate        e.g. 9.42 deaths/1,000 population (2015 est.)
        ## migration_rate    e.g. 5.56 migrant(s)/1,000 population (2015 est.)
        %i[population_growth birth_rate death_rate migration_rate].each { |column| t.float column }

        t.timestamps
      end
    end # block Schema.define
  end # method up

end # class CreateDb

end # module Factbook
|
data/lib/factbook/normalize.rb
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
module NormalizeHelper

  ## clean up a category (title/key):
  ##   strips whitespace and trailing colons and
  ##   maps known typos and spelling variants to one spelling;
  ##   any other text is returned (stripped) as is
  def normalize_category( text )

    ## remove trailing : if present -- note: allow (fix) note:: too, thus, use :+
    cleaned = text.strip.sub( /:+\z/, '' ).strip

    #######################################
    ### special cases

    normalized = case cleaned
                 ## typos e.g. ntoe => use note
                 when 'ntoe'                            then 'note'
                 when 'investment if fixed capital'     then 'investment in fixed capital'
                 ## downcase
                 when 'Lowest point'                    then 'lowest point'
                 when 'Chief of state'                  then 'chief of state'
                 ## spelling variant (use more popular one)
                 when 'signed but not ratified'         then 'signed, but not ratified'
                 when 'vectorborne disease'             then 'vectorborne diseases'
                 when 'water contact diseases'          then 'water contact disease'
                 when 'food or waterborne disease'      then 'food or waterborne diseases'
                 when 'geographical coordinates'        then 'geographic coordinates'
                 when 'notes'                           then 'note'
                 when 'refugees (countries of origin)'  then 'refugees (country of origin)'
                 else cleaned
                 end

    ## border countries (8): -- remove (x) counter
    normalized.start_with?( 'border countries' ) ? 'border countries' : normalized
  end

end # module NormalizeHelper
end # module Factbook
|