factbook 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +0 -61
- data/README.md +8 -506
- data/Rakefile +4 -9
- data/lib/factbook.rb +4 -64
- metadata +6 -124
- data/data/attributes.yml +0 -337
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook/almanac.rb +0 -72
- data/lib/factbook/attributes.rb +0 -74
- data/lib/factbook/builder.rb +0 -212
- data/lib/factbook/builder_item.rb +0 -126
- data/lib/factbook/builder_json.rb +0 -79
- data/lib/factbook/codes.rb +0 -119
- data/lib/factbook/comparisons.rb +0 -50
- data/lib/factbook/counter.rb +0 -48
- data/lib/factbook/db/importer.rb +0 -92
- data/lib/factbook/db/models.rb +0 -11
- data/lib/factbook/db/schema.rb +0 -36
- data/lib/factbook/normalize.rb +0 -43
- data/lib/factbook/page.rb +0 -148
- data/lib/factbook/page_info.rb +0 -12
- data/lib/factbook/reader_json.rb +0 -51
- data/lib/factbook/sanitizer.rb +0 -178
- data/lib/factbook/sect.rb +0 -29
- data/lib/factbook/subsect.rb +0 -18
- data/lib/factbook/table.rb +0 -52
- data/lib/factbook/utils.rb +0 -85
- data/lib/factbook/utils_info.rb +0 -129
- data/lib/factbook/version.rb +0 -21
- data/script/almanac.rb +0 -48
- data/script/attributes.rb +0 -34
- data/script/build.rb +0 -28
- data/script/counter.rb +0 -145
- data/script/json.rb +0 -19
- data/script/testbr.rb +0 -33
- data/script/testcodes.rb +0 -11
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/helper.rb +0 -11
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_builder.rb +0 -35
- data/test/test_codes.rb +0 -76
- data/test/test_comparisons.rb +0 -19
- data/test/test_convert.rb +0 -30
- data/test/test_counter.rb +0 -31
- data/test/test_fields.rb +0 -52
- data/test/test_importer.rb +0 -56
- data/test/test_item_builder.rb +0 -99
- data/test/test_json.rb +0 -45
- data/test/test_json_builder.rb +0 -25
- data/test/test_normalize.rb +0 -23
- data/test/test_page.rb +0 -38
- data/test/test_sanitizer.rb +0 -39
- data/test/test_sanitizer_regex.rb +0 -89
data/lib/factbook/utils.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
  module Utils

    ########################################
    ## todo: move to textutils - why, why not ?????

    ##
    # Re-encodes +text+ to UTF-8, replacing every byte that cannot be
    # converted with the unicode replacement character (�), and reports
    # where the replacements happened.
    #
    # Every replacement is printed to stdout and also collected
    # (with a timestamp) in the returned error list.
    #
    # text - the raw page text (String)
    #
    # Returns a two-element array: [converted_text, errors] where errors
    # is a (possibly empty) list of log message strings.
    def encode_utf8( text )
      errors = []   ## also return list of encoding errors

      ## note: factbook claims utf-8 - but includes invalid bytes in some pages
      ##   encoding is likely western/windows-
      ##
      ## note: 'binary' is Ruby's alias for ASCII-8BIT
      ## NOTE(review): with a binary source encoding every non-ASCII byte
      ##   (valid utf-8 sequences included) appears to get replaced - confirm
      ##   this is intended
      text = text.encode( 'UTF-8', 'binary', :invalid => :replace,
                                             :undef   => :replace,
                                             :replace => '�' )

      ## check for replaced/invalid chars and log warning
      pos = text.index( '�' )
      while pos
        ## clamp to 0 - a negative start index would slice from the
        ##   END of the string and report the wrong context
        from   = [pos-10, 0].max
        to     = pos+10           ## note: ranges past the end get truncated
        around = text[from..to]
        puts " pos #{pos}, from #{from}, to #{to}, around >#{around}<"
        msg = "invalid char on pos #{pos} around: >#{around}<"
        puts msg

        ## also log message / w timestamp
        errors << "#{Time.now} - #{msg}"

        pos = text.index( '�', pos+1 )
      end

      [text,errors]   ## return text and errors (list)
    end


    ##
    # Joins +values+ into a single csv line (without trailing newline).
    #
    # - numbers with thousands separators e.g. 17,098,242 or $17,098,242
    #   or -44,000 get the commas removed (17098242, $17098242, -44000)
    # - values containing a comma, a double quote or a line break get
    #   wrapped in double quotes; embedded double quotes get doubled
    #   e.g. Guam, The becomes "Guam, The"
    # - all other values get added as is (1:1)
    #
    # values - list of strings
    #
    # Returns the csv line (String); empty string for an empty list.
    def values_to_csv( values )
      cells = values.map do |value|
        ## note: allow optional $ sign e.g. $100,000,000
        ##       and optional minus e.g. -44,000
        ## note: use \A and \z to match the whole value (not just one line)
        if value =~ /\A-?\$?[1-9][,0-9]+[0-9]\z/   ### find a better regex - why? why not??
          value.gsub( ',', '' )
        elsif value =~ /[,"\r\n]/
          ## escape with double quotes; double up any embedded quotes
          %Q{"#{value.gsub( '"', '""' )}"}
        else
          value
        end
      end

      cells.join( ',' )
    end


    ##
    # Builds a complete csv document: one header line followed by
    # one line per record; every line is terminated by a newline.
    #
    # recs    - list of records (each record is a list of strings)
    # headers - list of column names (strings)
    #
    # Returns the csv text (String).
    def data_to_csv( recs, headers )
      lines = [values_to_csv( headers )]
      recs.each { |rec| lines << values_to_csv( rec ) }

      lines.map { |line| "#{line}\n" }.join
    end


  end # module Utils
end # module Factbook
|
data/lib/factbook/utils_info.rb
DELETED
@@ -1,129 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
  module Utils

    #######
    ## find meta data (about page info)


    #### e.g. Page last updated on September 16, 2015

    ## maps english month names (capitalized) to month numbers (as strings)
    MONTH_EN_TO_S={
      'January'   => '1',
      'February'  => '2',
      'March'     => '3',
      'April'     => '4',
      'May'       => '5',
      'June'      => '6',
      'July'      => '7',
      'August'    => '8',
      'September' => '9',
      'October'   => '10',
      'November'  => '11',
      'December'  => '12'
    }.freeze


    ##
    # examples (to match):
    #   Page last updated on November 03, 2016
    #   Page last updated on September 24, 2015

    PAGE_LAST_UPDATED_REGEX = /
        Page \s last \s updated \s on \s
        (?<month_en>[a-z]+) \s
        (?<day>\d{1,2}), \s
        (?<year>\d{4})
      /imx

    ##
    # Finds the "Page last updated on <month> <day>, <year>" stamp.
    #
    # note: the regex matches case-insensitive - that's why the month name
    #   gets normalized before the lookup in MONTH_EN_TO_S.
    #
    # html - the page text to search (String)
    #
    # Returns a Date or nil if there is no stamp or the month name
    # is unknown. Prints debug output to stdout on a match.
    def find_page_last_updated( html )
      m = PAGE_LAST_UPDATED_REGEX.match( html )
      return nil unless m

      pp m
      month_en = m[:month_en]
      day      = m[:day]
      year     = m[:year]
      puts "** bingo - month #{month_en}, day #{day}, year #{year}"

      ## note: normalize e.g. SEPTEMBER or september to September
      month = MONTH_EN_TO_S[ month_en.capitalize ]
      return nil unless month   ## no known month name - cannot build a date

      date_str = "#{year}-#{month}-#{day}"
      pp date_str
      Date.strptime( date_str, '%Y-%m-%d' )
    end


    ## fallback: find "standalone" country code
    ##  e.g.
    ##   ccode='au'

    COUNTRY_CODE_REGEX = /ccode='(?<cc>[a-z]+)'/

    ##
    # Finds the first ccode='xx' marker.
    #
    # html - the page text to search (String)
    #
    # Returns the (lowercase) code as a String or nil if not found.
    # Prints debug output to stdout on a match.
    def find_country_code( html )
      m = COUNTRY_CODE_REGEX.match( html )
      return nil unless m

      pp m
      cc = m[:cc]
      puts "** bingo - country code #{cc}"
      cc
    end


    ##
    ##  e.g. regioncode="eur"
    ##       countrycode="au"
    ##       countryname="Austria"
    ##       flagsubfield=""
    ##       countryaffiliation=""
    ##       flagdescription=""
    ##       flagdescriptionnote=""
    ##       region="Europe"
    ##
    ## note: countryaffiliation may be empty

    PAGE_INFO_REGEX = /
        regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
          \s+
        countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2>   ## is k<3> backref
          \s+
        countryname=(?<q3>"|')(?<country>.+?)\k<q3>
          \s+
        [^>]+?   ## allow any attribs (note: non-greedy)
        countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4>  ## note: might be empty
          \s+
        [^>]+?   ## allow any attribs (note: non-greedy)
        region=(?<q5>"|')(?<region>.+?)\k<q5>   ## check world - might be empty ?? or for ocean ??
      /imx


    ##
    # Finds the page info attributes (region code, country code, etc.)
    #
    # html - the page text to search (String)
    #
    # Returns a hash with the keys :country_code, :country_name,
    # :country_affiliation, :region_code and :region_name
    # or nil if not found. Prints debug output to stdout on a match.
    def find_page_info( html )
      m = PAGE_INFO_REGEX.match( html )
      return nil unless m   ## or return empty struct with nils/empty strings - why?? why not??

      pp m

      h = { country_code:        m[:country_code],
            country_name:        m[:country],
            country_affiliation: m[:affiliation],
            region_code:         m[:region_code],
            region_name:         m[:region] }

      puts "** bingo - #{h.inspect}"
      h    ## return hash w/ name-value pairs
    end


  end # module Utils
end # module Factbook
|
data/lib/factbook/version.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook

  ## version number parts
  MAJOR = 2
  MINOR = 0
  PATCH = 0
  VERSION = "#{MAJOR}.#{MINOR}.#{PATCH}"

  class << self
    ## returns the version string e.g. 2.0.0
    def version
      VERSION
    end

    ## returns a one-line banner with the library version and the ruby runtime
    def banner
      "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
    end

    ## returns the absolute path three directory levels up from this file
    def root
      path = __FILE__
      3.times { path = File.dirname( path ) }
      File.expand_path( path )
    end
  end

end
|
data/script/almanac.rb
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
# use to run:
|
4
|
-
# ruby -I ./lib script/almanac.rb
|
5
|
-
|
6
|
-
|
7
|
-
require 'factbook'
require 'fileutils'


## template used to render every page (passed on to Almanac#render)
TEMPLATE = <<EOS

### <%= names %>

<%= page.name_long=='none' ? '\-' : page.name_long %> › <%= page.map %> -- <%= page.location %> <br>
<%= page.capital %> • <%= page.area %> • pop. <%= page.population %>

**Languages:** <%= page.languages %>
**Major cities:** <%= page.major_cities %>
**Ethnic groups:** <%= page.ethnic_groups %>
**Religions:** <%= page.religions %>
**Independence:** <%= page.independence %>

**Internet:** `<%= page.internet %>` • <%= page.internet_users %> • <%= page.internet_users_rate %>
**Telephones - mobile:** <%= page.telephones_mobile %> • <%= page.telephones_mobile_subscriptions %> subs./100

EOS


#########################
### read all countries
###   using local json (dump) files

## see github.com/factbook/factbook.json (use git clone)
json_dir = '../../factbook/factbook.json'
codes    = Factbook.codes.countries
## todo: add taiwan and ?? from others - why, why not??

pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )

almanac = Factbook::Almanac.new( pages )

## save to disk
##   note: make sure the output directory exists
##     (File.open raises Errno::ENOENT otherwise)
FileUtils.mkdir_p( './tmp' )

File.open( './tmp/ALMANAC.md', 'w' ) do |f|
  f.write almanac.render( TEMPLATE )
end

puts "Done."
|
data/script/attributes.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
# use to run:
|
4
|
-
# ruby -I ./lib script/attributes.rb
|
5
|
-
|
6
|
-
|
7
|
-
# e.g. prints attribute accessor shortcuts
|
8
|
-
#
|
9
|
-
# ### Geography
|
10
|
-
#
|
11
|
-
# - `location` => Location
|
12
|
-
# - `coords` => Geographic coordinates
|
13
|
-
# - `map` => Map references
|
14
|
-
# ...
|
15
|
-
|
16
|
-
require 'factbook'


## collect all attribute definitions grouped by category
by_category = Factbook.attributes.to_a.group_by( &:category )

pp by_category

## print one markdown section per category
##   with one list item per attribute (accessor name => path)
by_category.each do |category, attribs|
  puts "", "### #{category}", ""

  attribs.each { |attrib| puts "- `#{attrib.name}` => #{attrib.path.join(' › ')}" }
end
|
34
|
-
|
data/script/build.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
# use to run/test:
|
4
|
-
# ruby -I ./lib script/build.rb
|
5
|
-
|
6
|
-
require 'factbook'

## connection settings for the local (sqlite) database file
DB_CONFIG = {
  adapter:  'sqlite3',
  database: './factbook.db'
}

ActiveRecord::Base.logger = Logger.new( STDOUT )
ActiveRecord::Base.establish_connection( DB_CONFIG )

Factbook::CreateDb.new.up    ## create tables

importer = Factbook::Importer.new

## fetch and import page by page (one per code)
Factbook.codes.each do |code|
  label = "#{code.code}- #{code.name}"

  puts "Fetching #{label}..."
  page = Factbook::Page.new( code.code )

  puts "Adding #{label}..."
  importer.import( page )
end

puts "Done."
|
data/script/counter.rb
DELETED
@@ -1,145 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
# use to run:
|
4
|
-
# ruby -I ./lib script/counter.rb
|
5
|
-
|
6
|
-
require 'factbook'
require 'fileutils'


c = Factbook::Counter.new

## see github.com/factbook/factbook.json (use git clone)
json_dir = '../../factbook/factbook.json'
codes    = Factbook.codes

pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )

## feed every page to the counter
pages.each { |page| c.count( page ) }

h = c.data
pp h

### save to json
puts "saving a copy to categories.json for debugging"
##   note: make sure the output directory exists
##     (File.open raises Errno::ENOENT otherwise)
FileUtils.mkdir_p( 'tmp' )
File.open( "tmp/categories.json", 'w' ) do |f|
  f.write JSON.pretty_generate( h )
end
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
## (sub)country entries to skip when printing categories;
##   one entry per line - empty lines and lines starting with # get ignored
SKIP_CATEGORIES_LINES=<<EOS

######
### france plus 5 overseas regions/departments

## metropolitan France
## metropolitan France - total
overseas departments
French Guiana
French Guiana - total
Guadeloupe
Guadeloupe and Martinique
Martinique
Mayotte
Reunion


###############
### more

Iles Eparses
Ile Amsterdam
Ile Amsterdam (Ile Amsterdam et Ile Saint-Paul)
Ile Amsterdam et Ile Saint-Paul
Ile Saint Paul
Ile Saint-Paul (Ile Amsterdam et Ile Saint-Paul)
Iles Crozet
Iles Kerguelen
Adelie Land
Bassas da India
Bassas da India (Iles Eparses)
Bassas da India, Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
Europa Island
Europa Island (Iles Eparses)
Europa Island, Glorioso Islands, Juan de Nova Island
Europa Island and Juan de Nova Island (Iles Eparses)
Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
Glorioso Islands
Glorioso Islands (Iles Eparses)
Glorioso Island (Iles Eparses)
Juan de Nova Island
Juan de Nova Island (Iles Eparses)
Tromelin Island
Tromelin Island (Iles Eparses)
Saint Helena
Ascension Island
Ascension
Tristan da Cunha
Tristan da Cunha island group
Baker Island
Baker, Howland, and Jarvis Islands
Baker, Howland, and Jarvis Islands, and Johnston Atoll
Baker, Howland, and Jarvis Islands, and Kingman Reef
Howland Island
Jarvis Island
Johnston Atoll
Johnston Atoll and Kingman Reef
Kingman Reef
Midway Islands
Midway Islands, Johnston, and Palmyra Atolls
Midway Islands and Palmyra Atoll
Palmyra Atoll
note on Palmyra Atoll
EOS

## drop blank lines and comment lines; keep all other lines as is
SKIP_CATEGORIES = SKIP_CATEGORIES_LINES.split("\n").reject do |line|
  line =~ /^\s*$/ || line =~ /^\s*#/
end
|
99
|
-
|
100
|
-
|
101
|
-
## prints a markdown heading (with count) for every top-level entry in data
##   followed by the nested list of its sub-entries (see walk_categories)
def print_categories( data )
  data.each do |name, entry|
    puts "", "## #{name} _(#{entry[:count]})_", ""

    walk_categories( entry, 1 )
  end
end
|
111
|
-
|
112
|
-
## prints all entries in data as a nested markdown list (recursively)
##
##  data  - hash of entry name => entry hash; the entry hash holds the stats
##            (:count and optional :codes) plus any sub-entries
##  level - nesting level (starting at 1); first level names get marked bold
##
##  note: entries listed in SKIP_CATEGORIES get skipped (incl. their sub-entries)
def walk_categories( data, level )
  data.each do |k,v|
    next if k == :count || k == :codes   ## skip "virtual" count entry (added for stats)

    ## skip (sub)country entries e.g. Baker Island, Ile Amsterdam, etc.
    next if SKIP_CATEGORIES.include?( k )

    ## add 4 spaces indent per extra level
    ##   (markdown needs more than a single space to nest list items)
    indent = '    ' * (level-1)
    name   = level == 1 ? "**#{k}**" : k.to_s     ## mark first level as bold
    stats  = v[:count].to_s
    stats += " - #{v[:codes]}"  if v[:codes]      ## add codes if present

    print "#{indent}- #{name} _(#{stats})_\n"

    walk_categories( v, level+1 )
  end
end
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
## print the collected category counts as a nested markdown list
print_categories( c.data )

puts "Done."
|
145
|
-
|