factbook-readers 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Manifest.txt +3 -25
- data/README.md +11 -69
- data/Rakefile +3 -3
- data/lib/factbook-readers.rb +5 -40
- data/lib/factbook-readers/convert.rb +37 -0
- data/lib/factbook-readers/counter.rb +7 -9
- data/lib/factbook-readers/page.rb +41 -61
- data/lib/factbook-readers/page_info.rb +15 -3
- data/lib/factbook-readers/version.rb +2 -2
- data/test/helper.rb +3 -0
- data/test/test_counter.rb +9 -6
- data/test/test_download.rb +27 -0
- data/test/test_fields.rb +44 -27
- data/test/test_json.rb +4 -4
- data/test/test_page.rb +8 -8
- data/test/test_version.rb +15 -0
- metadata +11 -48
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook-readers/builder.rb +0 -187
- data/lib/factbook-readers/builder_item.rb +0 -201
- data/lib/factbook-readers/builder_json.rb +0 -68
- data/lib/factbook-readers/codes.rb +0 -121
- data/lib/factbook-readers/comparisons.rb +0 -49
- data/lib/factbook-readers/normalize.rb +0 -42
- data/lib/factbook-readers/reader_json.rb +0 -50
- data/lib/factbook-readers/sanitizer.rb +0 -351
- data/lib/factbook-readers/sect.rb +0 -28
- data/lib/factbook-readers/subsect.rb +0 -17
- data/lib/factbook-readers/table.rb +0 -51
- data/lib/factbook-readers/utils.rb +0 -47
- data/lib/factbook-readers/utils_info.rb +0 -128
- data/test/test_builder.rb +0 -30
- data/test/test_codes.rb +0 -72
- data/test/test_comparisons.rb +0 -16
- data/test/test_item_builder.rb +0 -97
- data/test/test_json_builder.rb +0 -23
- data/test/test_normalize.rb +0 -21
- data/test/test_sanitizer.rb +0 -36
- data/test/test_sanitizer_regex.rb +0 -87
@@ -1,28 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
|
5
|
-
##
## A section of a factbook page: a title plus an ordered list of subsections.
class Sect
  include LogUtils::Logging

  attr_accessor :title      ## use name instead of title - why? why not?
  attr_accessor :subsects   ## list of subsections (starts out empty)

  def initialize
    @subsects = []
  end

  ## Converts the section into a hash keyed by subsection title;
  ##  each value is whatever that subsection returns from its data method.
  ##  note: the hash is rebuilt (and stored in @data) on every call.
  def data
    @data = subsects.each_with_object( {} ) do |subsect, hash|
      hash[ subsect.title ] = subsect.data
    end
  end

end # class Sect
|
27
|
-
|
28
|
-
end # module Factbook
|
@@ -1,17 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
|
5
|
-
##
## A subsection of a factbook section: a title plus a data hash.
class Subsect
  include LogUtils::Logging

  ## title - use name instead of title - why? why not?
  ## data  - hash holding data e.g. { 'text' => '...' etc. }
  attr_accessor :title, :data

  ## Starts out with an empty data hash (and no title).
  def initialize
    @data = {}
  end

end # class Subsect
|
16
|
-
|
17
|
-
end # module Factbook
|
@@ -1,51 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
|
4
|
-
##
|
5
|
-
## make more "generic" - why? why not?
|
6
|
-
## (re)use for other files ?? move to textutils ??
|
7
|
-
|
8
|
-
##
|
9
|
-
## for now reads in rows with values separated by at least 3+ spaces e.g.:
|
10
|
-
## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
|
11
|
-
## 1 China 1,367,485,388
|
12
|
-
## 2 India 1,251,695,584
|
13
|
-
## 3 European Union 513,949,445
|
14
|
-
## 4 United States 321,368,864
|
15
|
-
## 5 Indonesia 255,993,674
|
16
|
-
## 6 Brazil 204,259,812
|
17
|
-
|
18
|
-
|
19
|
-
##
## Reads a plain-text table where the values in a row are separated
##  by three or more spaces.
class TableReader
  include LogUtils::Logging

  ## text - the complete table as a single string (one row per line)
  def initialize( text )
    @text = text
  end

  ## Returns an array of rows; every row is an array of (string) values.
  ##  Blank lines are reported on stdout and skipped.
  def read
    rows = []

    @text.each_line.with_index( 1 ) do |raw, line_no|
      stripped = raw.strip     ## remove leading and trailing whitespace

      if stripped.empty?
        puts "** skipping empty line #{line_no}"
      else
        ## split on three or more spaces - use just two ?? why? why not??
        rows << stripped.split( /[ ]{3,}/ )
      end
    end

    rows
  end

end # class TableReader
|
50
|
-
|
51
|
-
end # module Factbook
|
@@ -1,47 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
module Utils

  ## Matches a number written with thousands separators, with an optional
  ##  leading minus and/or dollar sign e.g.
  ##    17,098,242  or  $17,098,242  or  -44,000
  ##  note: anchored with \A and \z (instead of ^ and $) so that the
  ##        WHOLE value must match, not just one line of a multi-line value.
  CSV_NUMBER_REGEX = /\A-?\$?[1-9][,0-9]+[0-9]\z/


  ## Converts a list of values into a single csv line (without newline).
  ##
  ##  - numbers with thousands separators get the commas removed
  ##      e.g. 17,098,242 becomes 17098242 (an optional $ or - sign is kept)
  ##  - values holding a comma, a double quote or a line break get wrapped
  ##      in double quotes (embedded double quotes get doubled)
  ##      e.g. Guam, The becomes "Guam, The"
  ##  - all other values get added as is 1:1
  ##
  ##  values - list of values; every value gets converted with to_s first
  ##             (that is, nil becomes an empty cell)
  ##
  ##  returns the csv line as a (new) string
  def values_to_csv( values )
    cells = values.map do |value|
      text = value.to_s

      if text =~ CSV_NUMBER_REGEX
        text.delete( ',' )
      elsif text =~ /[",\r\n]/
        ## escape with double quotes; double up any embedded double quote
        %Q{"#{text.gsub( '"', '""' )}"}
      else
        text
      end
    end

    cells.join( ',' )
  end


  ## Converts records into csv text - the header line comes first,
  ##  followed by one line per record; every line ends with a newline.
  ##
  ##  recs    - list of records (every record is a list of values)
  ##  headers - list of column names
  ##
  ##  returns the complete csv text as a string
  def data_to_csv( recs, headers )
    lines = [ values_to_csv( headers ) ]
    recs.each { |rec| lines << values_to_csv( rec ) }

    lines.map { |line| "#{line}\n" }.join
  end

end # module Utils
|
47
|
-
end # module Factbook
|
@@ -1,128 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
|
3
|
-
module Utils

  #######
  ## find meta data (about page info)


  ## Maps english month names to month numbers (as strings).
  ##  note: keys are capitalized e.g. September (not september or SEPTEMBER)
  MONTH_EN_TO_S = {
    'January'   => '1',
    'February'  => '2',
    'March'     => '3',
    'April'     => '4',
    'May'       => '5',
    'June'      => '6',
    'July'      => '7',
    'August'    => '8',
    'September' => '9',
    'October'   => '10',
    'November'  => '11',
    'December'  => '12'
  }.freeze


  ##
  # examples (to match):
  #   Page last updated on November 03, 2016
  #   Page last updated on September 24, 2015

  PAGE_LAST_UPDATED_REGEX = /
                             Page \s last \s updated \s on \s
                               (?<month_en>[a-z]+) \s
                               (?<day>\d{1,2}), \s
                               (?<year>\d{4})
                            /imx

  ## Finds the "Page last updated on ..." date in the page.
  ##
  ##  html - page source (text)
  ##
  ##  returns a Date or nil if there is no match or
  ##    if the month name is not a (full) english month name.
  ##  note: the regex matches case-insensitive, thus, the month name
  ##    gets capitalized before the lookup (e.g. september => September).
  ##  an impossible day (e.g. February 31) still raises
  ##    the error from Date.strptime.
  def find_page_last_updated( html )
    m = PAGE_LAST_UPDATED_REGEX.match( html )
    return nil unless m

    month = MONTH_EN_TO_S[ m[:month_en].capitalize ]
    return nil unless month

    Date.strptime( "#{m[:year]}-#{month}-#{m[:day]}", '%Y-%m-%d' )
  end


  ## fallback: find "standalone" country code
  ##  e.g.
  ##    ccode='au'

  COUNTRY_CODE_REGEX = /ccode='(?<cc>[a-z]+)'/

  ## Finds the "standalone" country code in the page.
  ##
  ##  html - page source (text)
  ##
  ##  returns the (lowercase) country code as a string e.g. au
  ##    or nil if there is no match
  def find_country_code( html )
    m = COUNTRY_CODE_REGEX.match( html )
    m ? m[:cc] : nil
  end


  ##
  ## e.g. regioncode="eur"
  ##      countrycode="au"
  ##      countryname="Austria"
  ##      flagsubfield=""
  ##      countryaffiliation=""
  ##      flagdescription=""
  ##      flagdescriptionnote=""
  ##      region="Europe"
  ##
  ##  note: countryaffiliation may be empty

  PAGE_INFO_REGEX = /
                      regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
                       \s+
                      countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2>    ## closing quote must match the opening quote
                       \s+
                      countryname=(?<q3>"|')(?<country>.+?)\k<q3>
                       \s+
                       [^>]+?                                   ## allow any attribs (note: non-greedy)
                      countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4>   ## note: might be empty
                       \s+
                       [^>]+?                                   ## allow any attribs (note: non-greedy)
                      region=(?<q5>"|')(?<region>.+?)\k<q5>     ## check world - might be empty ?? or for ocean ??
                    /imx

  ## Finds the page info attributes (region, country, affiliation) in the page.
  ##
  ##  html - page source (text)
  ##
  ##  returns a hash with the keys
  ##     :country_code, :country_name, :country_affiliation,
  ##     :region_code and :region_name
  ##   or nil if there is no match
  ##     (or return empty struct with nils or empty strings - why?? why not??)
  def find_page_info( html )
    m = PAGE_INFO_REGEX.match( html )
    return nil unless m

    { country_code:        m[:country_code],
      country_name:        m[:country],
      country_affiliation: m[:affiliation],
      region_code:         m[:region_code],
      region_name:         m[:region] }
  end

end # module Utils
|
128
|
-
end # module Factbook
|
data/test/test_builder.rb
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
###
|
2
|
-
# to run use
|
3
|
-
# ruby -I ./lib -I ./test test/test_builder.rb
|
4
|
-
|
5
|
-
|
6
|
-
require 'helper'
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
## Smoke test: builds the sample country pages and dumps the debug html.
class TestBuilder < MiniTest::Test

  def test_build
    %w[au be].each do |code|
      ## use/fix: ASCII-8BIT (e.g.keep as is) -???
      ## fix/todo: use ASCII8BIT/binary reader ??
      path    = "#{Factbook.root}/test/data/src/#{code}.html"
      builder = Factbook::Builder.from_file( path )
      pp builder.sects

      ## write out the debug version for inspection by hand
      File.open( "./tmp/#{code}.debug.html", 'w' ) { |file| file.write builder.html_debug }
    end

    assert true    ## assume everything ok (if we get here without an exception)
  end

end # class TestBuilder
|
30
|
-
|
data/test/test_codes.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
###
|
2
|
-
# to run use
|
3
|
-
# ruby -I ./lib -I ./test test/test_codes.rb
|
4
|
-
|
5
|
-
|
6
|
-
require 'helper'
|
7
|
-
|
8
|
-
|
9
|
-
## Checks the number of codes overall, by category and by region.
class TestCodes < MiniTest::Test

  def test_codes
    ## all codes
    assert_equal 261, Factbook.codes.size
    assert_equal 261, Factbook.codes.to_a.size

    ## by category
    assert_equal 195, Factbook.codes.countries.size
    assert_equal 52,  Factbook.codes.dependencies.size
    assert_equal 5,   Factbook.codes.oceans.size
    assert_equal 1,   Factbook.codes.world.size
    assert_equal 2,   Factbook.codes.others.size
    assert_equal 6,   Factbook.codes.misc.size

    assert_equal 8,   Factbook.codes.dependencies_us.size

    ## by region
    assert_equal 55,  Factbook.codes.europe.size
    assert_equal 9,   Factbook.codes.south_asia.size
    assert_equal 6,   Factbook.codes.central_asia.size
    assert_equal 22,  Factbook.codes.east_n_souteast_asia.size
    assert_equal 19,  Factbook.codes.middle_east.size
    assert_equal 56,  Factbook.codes.africa.size
    assert_equal 7,   Factbook.codes.north_america.size
    assert_equal 33,  Factbook.codes.central_america_n_caribbean.size
    assert_equal 14,  Factbook.codes.south_america.size
    assert_equal 30,  Factbook.codes.australia_oceania.size
    assert_equal 4,   Factbook.codes.antartica.size
    assert_equal 5,   Factbook.codes.region('Oceans').size
    assert_equal 1,   Factbook.codes.region('World').size

    ## category and region combined
    assert_equal 45,  Factbook.codes.countries.europe.size

    assert_equal Factbook.codes.category('Oceans').size, Factbook.codes.region('Oceans').size
    assert_equal Factbook.codes.category('World').size,  Factbook.codes.region('World').size

    ## all categories together must add up to all codes
    by_category = [
      Factbook.codes.countries,
      Factbook.codes.others,
      Factbook.codes.dependencies,
      Factbook.codes.misc,
      Factbook.codes.oceans,
      Factbook.codes.world
    ]
    assert_equal 261, by_category.map( &:size ).inject( :+ )

    ## all regions together must add up to all codes
    by_region = [
      Factbook.codes.europe,
      Factbook.codes.south_asia,
      Factbook.codes.central_asia,
      Factbook.codes.east_n_souteast_asia,
      Factbook.codes.middle_east,
      Factbook.codes.africa,
      Factbook.codes.north_america,
      Factbook.codes.central_america_n_caribbean,
      Factbook.codes.south_america,
      Factbook.codes.australia_oceania,
      Factbook.codes.antartica,
      Factbook.codes.region('Oceans'),
      Factbook.codes.region('World')
    ]
    assert_equal 261, by_region.map( &:size ).inject( :+ )
  end

end # class TestCodes
|
71
|
-
|
72
|
-
|
data/test/test_comparisons.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
###
|
2
|
-
# to run use
|
3
|
-
# ruby -I ./lib -I ./test test/test_comparisons.rb
|
4
|
-
|
5
|
-
|
6
|
-
require 'helper'
|
7
|
-
|
8
|
-
|
9
|
-
## Checks the number of comparisons.
class TestComparisons < MiniTest::Test

  def test_comparisons
    expected = 74

    assert_equal expected, Factbook.comparisons.size
    assert_equal expected, Factbook.comparisons.to_a.size
  end

end # class TestComparisons
|
data/test/test_item_builder.rb
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
###
|
2
|
-
# to run use
|
3
|
-
# ruby -I ./lib -I ./test test/test_item_builder.rb
|
4
|
-
|
5
|
-
|
6
|
-
require 'helper'
|
7
|
-
|
8
|
-
|
9
|
-
## Smoke tests: every test feeds a html snippet for one field into the
##  item builder and passes if reading it raises no exception.
class TestItemBuilder < MiniTest::Test

  def test_location
    read_item( <<HTML, 'Location' )
<div class=category_data>Central Europe, north of Italy and Slovenia</div>
HTML

    assert true    ## assume everything ok
  end

  def test_area
    read_item( <<HTML, 'Area' )
<div><span class=category>total: </span><span class=category_data>83,871 sq km</span></div>
<div><span class=category>land: </span><span class=category_data>82,445 sq km</span></div>
<div><span class=category>water: </span><span class=category_data>1,426 sq km</span></div>
HTML

    assert true    ## assume everything ok
  end

  def test_land_use
    read_item( <<HTML, 'Land use' )
<div><span class=category>agricultural land: </span><span class=category_data>38.4%</span></div>
<div class=category_data>arable land 16.5%; permanent crops 0.8%; permanent pasture 21.1%</div>
<div><span class=category>forest: </span><span class=category_data>47.2%</span></div>
<div><span class=category>other: </span><span class=category_data>14.4% (2011 est.)</span></div>
HTML

    assert true    ## assume everything ok
  end

  def test_contraceptive_prevalence_rate
    read_item( <<HTML, 'Contraceptive Prevalence Rate' )
<div class=category_data>69.6%</div>
<div><span class=category>note: </span><span class=category_data>percent of women aged 18-46 (2008/09)</span></div>
HTML

    assert true    ## assume everything ok
  end

  def test_drinking_water_source
    read_item( <<HTML, 'Drinking Water Source' )
<div><span class=category>improved: </span><span class=category_data></span></div>
<div class=category_data>urban: 100% of population</div>
<div class=category_data>rural: 100% of population</div>
<div class=category_data>total: 100% of population</div>
<div><span class=category>unimproved: </span><span class=category_data></span></div>
<div class=category_data>urban: 0% of population</div>
<div class=category_data>rural: 0% of population</div>
<div class=category_data>total: 0% of population (2015 est.)</div>
HTML

    assert true    ## assume everything ok
  end

  def test_political_pressure_groups_and_leaders
    read_item( <<HTML, 'Political pressure groups and leaders' )
<div class=category_data>Austrian Trade Union Federation or OeGB (nominally independent but primarily Social Democratic)</div>
<div class=category_data>Federal Economic Chamber (OeVP-dominated)</div>
<div class=category_data>Labor Chamber or AK (Social Democratic-leaning think tank)</div>
<div class=category_data>OeVP-oriented Association of Austrian Industrialists or IV</div>
<div class=category_data>Roman Catholic Church, including its chief lay organization, Catholic Action</div>
<div><span class=category>other: </span><span class=category_data>three composite leagues of the Austrian People's Party or OeVP representing business, labor, farmers, and other nongovernment organizations in the areas of environment and human rights</span></div>
HTML

    assert true    ## assume everything ok
  end

  private

  ## Builds an item builder for the html snippet and field name and reads it.
  def read_item( html, name )
    builder = Factbook::ItemBuilder.new( html, name )
    builder.read
  end

end # class TestItemBuilder
|
97
|
-
|