factbook 2.0.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/Manifest.txt +0 -61
- data/README.md +8 -506
- data/Rakefile +4 -9
- data/lib/factbook.rb +4 -64
- metadata +6 -124
- data/data/attributes.yml +0 -337
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook/almanac.rb +0 -72
- data/lib/factbook/attributes.rb +0 -74
- data/lib/factbook/builder.rb +0 -212
- data/lib/factbook/builder_item.rb +0 -126
- data/lib/factbook/builder_json.rb +0 -79
- data/lib/factbook/codes.rb +0 -119
- data/lib/factbook/comparisons.rb +0 -50
- data/lib/factbook/counter.rb +0 -48
- data/lib/factbook/db/importer.rb +0 -92
- data/lib/factbook/db/models.rb +0 -11
- data/lib/factbook/db/schema.rb +0 -36
- data/lib/factbook/normalize.rb +0 -43
- data/lib/factbook/page.rb +0 -148
- data/lib/factbook/page_info.rb +0 -12
- data/lib/factbook/reader_json.rb +0 -51
- data/lib/factbook/sanitizer.rb +0 -178
- data/lib/factbook/sect.rb +0 -29
- data/lib/factbook/subsect.rb +0 -18
- data/lib/factbook/table.rb +0 -52
- data/lib/factbook/utils.rb +0 -85
- data/lib/factbook/utils_info.rb +0 -129
- data/lib/factbook/version.rb +0 -21
- data/script/almanac.rb +0 -48
- data/script/attributes.rb +0 -34
- data/script/build.rb +0 -28
- data/script/counter.rb +0 -145
- data/script/json.rb +0 -19
- data/script/testbr.rb +0 -33
- data/script/testcodes.rb +0 -11
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/helper.rb +0 -11
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_builder.rb +0 -35
- data/test/test_codes.rb +0 -76
- data/test/test_comparisons.rb +0 -19
- data/test/test_convert.rb +0 -30
- data/test/test_counter.rb +0 -31
- data/test/test_fields.rb +0 -52
- data/test/test_importer.rb +0 -56
- data/test/test_item_builder.rb +0 -99
- data/test/test_json.rb +0 -45
- data/test/test_json_builder.rb +0 -25
- data/test/test_normalize.rb +0 -23
- data/test/test_page.rb +0 -38
- data/test/test_sanitizer.rb +0 -39
- data/test/test_sanitizer_regex.rb +0 -89
data/lib/factbook/utils.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
module Utils
|
5
|
-
|
6
|
-
########################################
|
7
|
-
## todo: move to textutils - why, why not ?????
|
8
|
-
|
9
|
-
##
## Transcode text to UTF-8 (reading the input as raw bytes) and report
##  every position where a replacement character ended up in the result.
##
##  text - String with the (downloaded) page content
##
##  Returns a two-element array [text, errors]:
##   text   - the transcoded String; unmappable bytes are replaced by '�'
##   errors - Array of "<timestamp> - <message>" Strings, one entry per
##            replacement character found (empty if there are none)
##
##  Side effect: prints a diagnostic line pair for every hit via puts.
def encode_utf8( text )

  errors = []   ## also return list of encoding errors

  ## note: factbook claims utf-8 - but includes invalid bytes in some pages
  ##   encoding is likely western/windows-

  ## note:
  ##   use � - unknown/invalid unicode char
  ##   fix/todo: use ASCII-8BIT instead of binary
  ## NOTE(review): with 'binary' as the source encoding presumably every
  ##   byte >= 0x80 counts as undefined and gets replaced - confirm intended
  text = text.encode( 'UTF-8', 'binary', :invalid => :replace,
                                         :undef   => :replace,
                                         :replace => '�' )

  ## check for replaced/invalid chars and log warning
  pos = text.index( '�' )
  while pos
    ## clamp the lower bound at 0 - a negative start index would be counted
    ##  from the end of the string and yield an empty/unrelated excerpt;
    ##  an upper bound past the end is fine (ranges get truncated)
    from = [pos-10, 0].max
    to   = pos+10
    around = text[from..to]
    puts " pos #{pos}, from #{from}, to #{to}, around >#{around}<"
    msg = "invalid char on pos #{pos} around: >#{around}<"
    puts msg

    ## also log message / w timestamp
    errors << "#{Time.now} - #{msg}"

    pos = text.index( '�', pos+1 )
  end

  [text,errors]   ## return text and errors (list)
end
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
##
## Join a list of string values into a single csv line (without newline).
##
##  - numbers with thousands separators get the commas removed
##      e.g. 17,098,242 => 17098242, $17,098,242 => $17098242,
##           -44,000 => -44000
##  - values with a comma, double quote or line break get wrapped in
##      double quotes (embedded double quotes are doubled)
##      e.g. Guam, The  => "Guam, The"
##  - all other values are added as is 1:1
##
##  values - Array of Strings
##
##  Returns the csv line as a String (empty for an empty list).
def values_to_csv( values )
  cells = values.map do |value|
    ## note: \A and \z anchor the whole value (^ and $ only anchor a line);
    ##   allow optional minus and optional $ sign e.g. -44,000 or $100,000,000
    if value =~ /\A-?\$?[1-9][,0-9]+[0-9]\z/
      value.delete( ',' )     ## number - remove commas e.g. 17098242
    elsif value =~ /[,"\r\n]/
      ## escape with double quotes; double up any quotes inside the value
      %Q("#{value.gsub( '"', '""' )}")
    else
      value                   ## add as is 1:1
    end
  end

  cells.join( ',' )
end
|
67
|
-
|
68
|
-
|
69
|
-
##
## Build a complete csv text: the header line first, followed by
##  one line per record; every line (incl. the last) ends with a newline.
##
##  recs    - list of records (each record is a list of values)
##  headers - list of column names
##
##  Returns the csv text as a String.
def data_to_csv( recs, headers )
  lines = [values_to_csv( headers )]
  recs.each { |rec| lines << values_to_csv( rec ) }

  lines.map { |line| "#{line}\n" }.join
end
|
82
|
-
|
83
|
-
|
84
|
-
end # module Utils
|
85
|
-
end # module Factbook
|
data/lib/factbook/utils_info.rb
DELETED
@@ -1,129 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
module Utils
|
5
|
-
|
6
|
-
#######
|
7
|
-
## find meta data (about page info)
|
8
|
-
|
9
|
-
|
10
|
-
#### e.g. Page last updated on September 16, 2015
|
11
|
-
|
12
|
-
MONTH_EN_TO_S={
|
13
|
-
'January' => '1',
|
14
|
-
'February' => '2',
|
15
|
-
'March' => '3',
|
16
|
-
'April' => '4',
|
17
|
-
'May' => '5',
|
18
|
-
'June' => '6',
|
19
|
-
'July' => '7',
|
20
|
-
'August' => '8',
|
21
|
-
'September' => '9',
|
22
|
-
'October' => '10',
|
23
|
-
'November' => '11',
|
24
|
-
'December' => '12'
|
25
|
-
}
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
##
|
30
|
-
# examples (to match):
|
31
|
-
# Page last updated on November 03, 2016
|
32
|
-
# Page last updated on September 24, 2015
|
33
|
-
|
34
|
-
PAGE_LAST_UPDATED_REGEX = /
|
35
|
-
Page \s last \s updated \s on \s
|
36
|
-
(?<month_en>[a-z]+) \s
|
37
|
-
(?<day>\d{1,2}), \s
|
38
|
-
(?<year>\d{4})
|
39
|
-
/imx
|
40
|
-
|
41
|
-
##
## Find the "Page last updated on <month> <day>, <year>" note in html
##  (matched with PAGE_LAST_UPDATED_REGEX) and convert it to a date.
##
##  html - String with the page content
##
##  Returns a Date or nil if the note is missing or the matched
##   month is not an english month name listed in MONTH_EN_TO_S.
##  Date.strptime raises if day and month do not form a valid calendar date.
##
##  Side effect: prints the match and the date string for debugging (pp/puts).
def find_page_last_updated( html )
  m = PAGE_LAST_UPDATED_REGEX.match( html )
  return nil unless m

  pp m
  month_en = m[:month_en]
  day      = m[:day]
  year     = m[:year]
  puts "** bingo - month #{month_en}, day #{day}, year #{year}"

  ## note: the regex matches case-insensitive (e.g. SEPTEMBER or september)
  ##   but the lookup table keys are capitalized (e.g. September)
  month = MONTH_EN_TO_S[ month_en.capitalize ]
  return nil unless month    ## matched word is no known month name

  date_str = "#{year}-#{month}-#{day}"
  pp date_str
  Date.strptime( date_str, '%Y-%m-%d' )
end
|
59
|
-
|
60
|
-
|
61
|
-
## fallback: find "standalone" country code
|
62
|
-
## e.g.
|
63
|
-
## ccode='au'
|
64
|
-
|
65
|
-
COUNTRY_CODE_REGEX = /ccode='(?<cc>[a-z]+)'/
|
66
|
-
|
67
|
-
##
## Look for a "standalone" country code (matched with COUNTRY_CODE_REGEX)
##  e.g. ccode='au'
##
##  html - String with the page content
##
##  Returns the code as a String e.g. au or nil if nothing matched.
##  Side effect: prints the match for debugging (pp/puts).
def find_country_code( html )
  match = COUNTRY_CODE_REGEX.match( html )
  return nil unless match

  pp match
  code = match[:cc]
  puts "** bingo - country code #{code}"
  code
end
|
78
|
-
|
79
|
-
|
80
|
-
##
|
81
|
-
## e.g. regioncode="eur"
|
82
|
-
## countrycode="au"
|
83
|
-
## countryname="Austria"
|
84
|
-
## flagsubfield=""
|
85
|
-
## countryaffiliation=""
|
86
|
-
## flagdescription=""
|
87
|
-
## flagdescriptionnote=""
|
88
|
-
## region="Europe"
|
89
|
-
##
|
90
|
-
## note: countryaffiliation may be empty
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
PAGE_INFO_REGEX = /
|
95
|
-
regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
|
96
|
-
\s+
|
97
|
-
countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2> ## is k<3> backref
|
98
|
-
\s+
|
99
|
-
countryname=(?<q3>"|')(?<country>.+?)\k<q3>
|
100
|
-
\s+
|
101
|
-
[^>]+? ## allow any attribs (note: non-greedy)
|
102
|
-
countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4> ## note: might be empty
|
103
|
-
\s+
|
104
|
-
[^>]+? ## allow any attribs (note: non-greedy)
|
105
|
-
region=(?<q5>"|')(?<region>.+?)\k<q5> ## check world - might be empty ?? or for ocean ??
|
106
|
-
/imx
|
107
|
-
|
108
|
-
|
109
|
-
##
## Collect the page meta data (matched with PAGE_INFO_REGEX).
##
##  html - String with the page content
##
##  Returns a hash w/ name-value pairs - keys are :country_code,
##   :country_name, :country_affiliation, :region_code and :region_name -
##   or nil if the page info is missing.
##  Side effect: prints the match and the hash for debugging (pp/puts).
def find_page_info( html )
  match = PAGE_INFO_REGEX.match( html )
  return nil if match.nil?   ## or return empty struct with nils/empty strings - why?? why not??

  pp match

  ## hash key => name of capture group in regex
  groups = { country_code:        :country_code,
             country_name:        :country,
             country_affiliation: :affiliation,
             region_code:         :region_code,
             region_name:         :region }

  info = groups.each_with_object( {} ) do |(key, group), acc|
    acc[key] = match[group]
  end

  puts "** bingo - #{info.inspect}"
  info
end
|
126
|
-
|
127
|
-
|
128
|
-
end # module Utils
|
129
|
-
end # module Factbook
|
data/lib/factbook/version.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook

  ## version number parts
  MAJOR = 2
  MINOR = 0
  PATCH = 0
  VERSION = "#{MAJOR}.#{MINOR}.#{PATCH}"

  class << self
    ## version string e.g. 2.0.0
    def version
      VERSION
    end

    ## one-line banner with library version plus ruby version, release date and platform
    def banner
      "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
    end

    ## absolute path of the directory three levels up from this file
    def root
      base = 3.times.reduce( __FILE__ ) { |path, _| File.dirname( path ) }
      File.expand_path( base )
    end
  end

end
|
data/script/almanac.rb
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
# use to run:
|
4
|
-
# ruby -I ./lib script/almanac.rb
|
5
|
-
|
6
|
-
|
7
|
-
require 'factbook'
|
8
|
-
|
9
|
-
|
10
|
-
TEMPLATE = <<EOS
|
11
|
-
|
12
|
-
### <%= names %>
|
13
|
-
|
14
|
-
<%= page.name_long=='none' ? '\-' : page.name_long %> › <%= page.map %> -- <%= page.location %> <br>
|
15
|
-
<%= page.capital %> • <%= page.area %> • pop. <%= page.population %>
|
16
|
-
|
17
|
-
**Languages:** <%= page.languages %>
|
18
|
-
**Major cities:** <%= page.major_cities %>
|
19
|
-
**Ethnic groups:** <%= page.ethnic_groups %>
|
20
|
-
**Religions:** <%= page.religions %>
|
21
|
-
**Independence:** <%= page.independence %>
|
22
|
-
|
23
|
-
**Internet:** `<%= page.internet %>` • <%= page.internet_users %> • <%= page.internet_users_rate %>
|
24
|
-
**Telephones - mobile:** <%= page.telephones_mobile %> • <%= page.telephones_mobile_subscriptions %> subs./100
|
25
|
-
|
26
|
-
EOS
|
27
|
-
|
28
|
-
|
29
|
-
#########################
|
30
|
-
### read all countries
|
31
|
-
### using local json (dump) files
|
32
|
-
|
33
|
-
## see github.com/factbook/factbook.json (use git clone)
|
34
|
-
json_dir = '../../factbook/factbook.json'
codes    = Factbook.codes.countries
## todo: add taiwan and ?? from others - why, why not??

reader = Factbook::JsonPageReader.new( json_dir )
pages  = reader.read_pages( codes )

almanac = Factbook::Almanac.new( pages )

## render all pages using TEMPLATE and save to disk
File.open( './tmp/ALMANAC.md', 'w' ) { |f| f.write almanac.render( TEMPLATE ) }

puts "Done."
|
data/script/attributes.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
# use to run:
|
4
|
-
# ruby -I ./lib script/attributes.rb
|
5
|
-
|
6
|
-
|
7
|
-
# e.g. prints attribute accessor shortcuts
|
8
|
-
#
|
9
|
-
# ### Geography
|
10
|
-
#
|
11
|
-
# - `location` => Location
|
12
|
-
# - `coords` => Geographic coordinates
|
13
|
-
# - `map` => Map references
|
14
|
-
# ...
|
15
|
-
|
16
|
-
require 'factbook'
|
17
|
-
|
18
|
-
|
19
|
-
## group all attribute definitions by category and dump the hash for debugging
attribs = Factbook.attributes.to_a
h = attribs.group_by( &:category )

pp h

## print one markdown section per category with a bullet per attribute
h.each_pair do |category, members|
  puts "", "### #{category}", ""

  members.each { |a| puts "- `#{a.name}` => #{a.path.join(' › ')}" }
end
|
34
|
-
|
data/script/build.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
# use to run/test:
|
4
|
-
# ruby -I ./lib script/build.rb
|
5
|
-
|
6
|
-
require 'factbook'
|
7
|
-
|
8
|
-
## connection settings passed to ActiveRecord below
DB_CONFIG = { adapter: 'sqlite3', database: './factbook.db' }

ActiveRecord::Base.logger = Logger.new( STDOUT )
ActiveRecord::Base.establish_connection( DB_CONFIG )

Factbook::CreateDb.new.up    ## create tables

importer = Factbook::Importer.new

## build a page for every code and hand it to the importer
Factbook.codes.each do |rec|
  puts "Fetching #{rec.code}- #{rec.name}..."
  page = Factbook::Page.new( rec.code )

  puts "Adding #{rec.code}- #{rec.name}..."
  importer.import( page )
end

puts "Done."
|
data/script/counter.rb
DELETED
@@ -1,145 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
# use to run:
|
4
|
-
# ruby -I ./lib script/counter.rb
|
5
|
-
|
6
|
-
require 'factbook'
|
7
|
-
|
8
|
-
|
9
|
-
c = Factbook::Counter.new

## see github.com/factbook/factbook.json (use git clone)
json_dir = '../../factbook/factbook.json'
codes    = Factbook.codes

pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )
pages.each { |page| c.count( page ) }   ## feed every page to the counter

h = c.data
pp h

### save to json
puts "saving a copy to categories.json for debugging"
File.open( "tmp/categories.json", 'w' ) { |f| f.write JSON.pretty_generate( h ) }
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
SKIP_CATEGORIES_LINES=<<EOS
|
33
|
-
|
34
|
-
######
|
35
|
-
### france plus 5 overseas regions/departments
|
36
|
-
|
37
|
-
## metropolitan France
|
38
|
-
## metropolitan France - total
|
39
|
-
overseas departments
|
40
|
-
French Guiana
|
41
|
-
French Guiana - total
|
42
|
-
Guadeloupe
|
43
|
-
Guadeloupe and Martinique
|
44
|
-
Martinique
|
45
|
-
Mayotte
|
46
|
-
Reunion
|
47
|
-
|
48
|
-
|
49
|
-
###############
|
50
|
-
### more
|
51
|
-
|
52
|
-
Iles Eparses
|
53
|
-
Ile Amsterdam
|
54
|
-
Ile Amsterdam (Ile Amsterdam et Ile Saint-Paul)
|
55
|
-
Ile Amsterdam et Ile Saint-Paul
|
56
|
-
Ile Saint Paul
|
57
|
-
Ile Saint-Paul (Ile Amsterdam et Ile Saint-Paul)
|
58
|
-
Iles Crozet
|
59
|
-
Iles Kerguelen
|
60
|
-
Adelie Land
|
61
|
-
Bassas da India
|
62
|
-
Bassas da India (Iles Eparses)
|
63
|
-
Bassas da India, Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
|
64
|
-
Europa Island
|
65
|
-
Europa Island (Iles Eparses)
|
66
|
-
Europa Island, Glorioso Islands, Juan de Nova Island
|
67
|
-
Europa Island and Juan de Nova Island (Iles Eparses)
|
68
|
-
Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
|
69
|
-
Glorioso Islands
|
70
|
-
Glorioso Islands (Iles Eparses)
|
71
|
-
Glorioso Island (Iles Eparses)
|
72
|
-
Juan de Nova Island
|
73
|
-
Juan de Nova Island (Iles Eparses)
|
74
|
-
Tromelin Island
|
75
|
-
Tromelin Island (Iles Eparses)
|
76
|
-
Saint Helena
|
77
|
-
Ascension Island
|
78
|
-
Ascension
|
79
|
-
Tristan da Cunha
|
80
|
-
Tristan da Cunha island group
|
81
|
-
Baker Island
|
82
|
-
Baker, Howland, and Jarvis Islands
|
83
|
-
Baker, Howland, and Jarvis Islands, and Johnston Atoll
|
84
|
-
Baker, Howland, and Jarvis Islands, and Kingman Reef
|
85
|
-
Howland Island
|
86
|
-
Jarvis Island
|
87
|
-
Johnston Atoll
|
88
|
-
Johnston Atoll and Kingman Reef
|
89
|
-
Kingman Reef
|
90
|
-
Midway Islands
|
91
|
-
Midway Islands, Johnston, and Palmyra Atolls
|
92
|
-
Midway Islands and Palmyra Atoll
|
93
|
-
Palmyra Atoll
|
94
|
-
note on Palmyra Atoll
|
95
|
-
EOS
|
96
|
-
|
97
|
-
## allow empty lines and skip comments
|
98
|
-
## keep every line that is neither blank nor a comment (starting with #)
SKIP_CATEGORIES = SKIP_CATEGORIES_LINES.split("\n").reject do |line|
  line =~ /^\s*$/ || line =~ /^\s*#/
end
|
99
|
-
|
100
|
-
|
101
|
-
##
## Print the category tree as markdown: one heading (with count)
##  per top-level entry followed by its (nested) bullet list.
##
##  data - hash; every value is a hash with a :count entry
##         plus nested (sub)category entries
def print_categories( data )
  data.each_pair do |title, node|
    puts "", "## #{title} _(#{node[:count]})_", ""

    walk_categories( node, 1 )
  end
end
|
111
|
-
|
112
|
-
##
## Print all entries in data as a nested markdown bullet list (recursive).
##
##  data  - hash; the "virtual" :count and :codes entries are skipped,
##          all other values are hashes again (with :count, optional :codes
##          and more nested entries)
##  level - nesting depth starting at 1; level 1 entries are printed in bold
##
##  Every entry prints as one line e.g.
##    - **Area** _(261)_
##        - total _(259 - au,be)_
def walk_categories( data, level )
  data.each do |k,v|
    next if k == :count || k == :codes     ## skip "virtual" count entry (added for stats)

    ## skip (sub)country entries e.g. Baker Island, Ile Amsterdam, etc.
    next if SKIP_CATEGORIES.include?( k )

    ## add 4 spaces indent per extra level (less will not nest a markdown list)
    indent = level > 1 ? '    ' * (level-1) : ''
    label  = level == 1 ? "**#{k}**" : k.to_s    ## mark first level as bold
    stats  = v[:count].to_s
    stats += " - #{v[:codes]}"  if v[:codes]     ## add codes if present

    print "#{indent}- #{label} _(#{stats})_\n"

    walk_categories( v, level+1 )
  end
end
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
print_categories( c.data )
|
143
|
-
|
144
|
-
puts "Done."
|
145
|
-
|