factbook-readers 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/Manifest.txt +3 -25
- data/README.md +11 -69
- data/Rakefile +3 -3
- data/lib/factbook-readers.rb +5 -40
- data/lib/factbook-readers/convert.rb +37 -0
- data/lib/factbook-readers/counter.rb +7 -9
- data/lib/factbook-readers/page.rb +41 -61
- data/lib/factbook-readers/page_info.rb +15 -3
- data/lib/factbook-readers/version.rb +2 -2
- data/test/helper.rb +3 -0
- data/test/test_counter.rb +9 -6
- data/test/test_download.rb +27 -0
- data/test/test_fields.rb +44 -27
- data/test/test_json.rb +4 -4
- data/test/test_page.rb +8 -8
- data/test/test_version.rb +15 -0
- metadata +11 -48
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook-readers/builder.rb +0 -187
- data/lib/factbook-readers/builder_item.rb +0 -201
- data/lib/factbook-readers/builder_json.rb +0 -68
- data/lib/factbook-readers/codes.rb +0 -121
- data/lib/factbook-readers/comparisons.rb +0 -49
- data/lib/factbook-readers/normalize.rb +0 -42
- data/lib/factbook-readers/reader_json.rb +0 -50
- data/lib/factbook-readers/sanitizer.rb +0 -351
- data/lib/factbook-readers/sect.rb +0 -28
- data/lib/factbook-readers/subsect.rb +0 -17
- data/lib/factbook-readers/table.rb +0 -51
- data/lib/factbook-readers/utils.rb +0 -47
- data/lib/factbook-readers/utils_info.rb +0 -128
- data/test/test_builder.rb +0 -30
- data/test/test_codes.rb +0 -72
- data/test/test_comparisons.rb +0 -16
- data/test/test_item_builder.rb +0 -97
- data/test/test_json_builder.rb +0 -23
- data/test/test_normalize.rb +0 -21
- data/test/test_sanitizer.rb +0 -36
- data/test/test_sanitizer_regex.rb +0 -87
@@ -1,28 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook

  ##
  ## A section: holds a title and an ordered list of subsections.
  class Sect
    include LogUtils::Logging

    attr_accessor :title       ## use name instead of title - why? why not?
    attr_accessor :subsects    ## list of subsections (starts out empty)

    def initialize
      @subsects = []
    end

    ## Converts the subsections into a hash keyed by subsection title;
    ##  the hash gets rebuilt (and stored in @data) on every call.
    ##  note: a later subsection with the same title replaces an earlier one.
    def data
      @data = {}
      subsects.each { |subsect| @data[ subsect.title ] = subsect.data }
      @data
    end

  end # class Sect

end # module Factbook
|
@@ -1,17 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook

  ##
  ## A subsection: holds a title and a hash with the subsection's data.
  class Subsect
    include LogUtils::Logging

    ## title - use name instead of title - why? why not?
    ## data  - hash holding data e.g. { 'text' => '...' etc. }
    attr_accessor :title, :data

    ## Starts out with an empty data hash (the title stays unset / nil).
    def initialize
      @data = {}
    end

  end # class Subsect

end # module Factbook
|
@@ -1,51 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook

  ##
  ##  make more "generic" - why? why not?
  ##   (re)use for other files ?? move to textutils ??

  ##
  ##  Reads plain-text tables - one record per line with the values
  ##   separated by three or more spaces e.g.:
  ##  see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
  ##   1      China              1,367,485,388
  ##   2      India              1,251,695,584
  ##   3      European Union     513,949,445
  ##   4      United States      321,368,864
  ##   5      Indonesia          255,993,674
  ##   6      Brazil             204,259,812

  class TableReader
    include LogUtils::Logging

    ## text - the complete table as a single (multi-line) string
    def initialize( text )
      @text = text
    end

    ## Returns an array of records; every record is an array of strings
    ##  (one string per column). Blank lines get reported and skipped.
    def read
      recs = []

      @text.each_line.with_index( 1 ) do |raw, line_no|
        line = raw.strip    ## remove leading and trailing whitespace

        if line.empty?
          puts "** skipping empty line #{line_no}"
        else
          ## split on three or more spaces - use just two ?? why? why not??
          recs << line.split( /[ ]{3,}/ )
        end
      end

      recs
    end

  end # class TableReader

end # module Factbook
|
@@ -1,47 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
  module Utils

    ## Converts a list of values into a single csv line (without a trailing newline).
    ##
    ##  values - array of values; anything that isn't a string gets converted with to_s
    ##
    ##  - numbers with thousands separators get the commas removed
    ##      e.g. 17,098,242 => 17098242, $17,098,242 => $17098242, -44,000 => -44000
    ##  - values with a comma, a double quote or a line break get wrapped in
    ##      double quotes (with embedded double quotes doubled up)
    ##      e.g. Guam, The => "Guam, The"
    ##  - everything else gets added as is (1:1)
    ##
    ##  returns the csv line as a string
    def values_to_csv( values )
      fields = values.map do |value|
        text = value.to_s

        ## note: allow optional minus and optional $ sign e.g. -44,000 or $100,000,000
        ##   use \A and \z to match the complete value (and not just a single line)
        if text =~ /\A-?\$?[1-9][,0-9]+[0-9]\z/
          text.gsub( ',', '' )
        elsif text =~ /[",\r\n]/
          ## quote the value - otherwise the csv line would be ambiguous / malformed
          '"' + text.gsub( '"', '""' ) + '"'
        else
          text
        end
      end

      fields.join( ',' )
    end


    ## Converts records into a csv document - the header line comes first
    ##  followed by one line per record; every line ends with a newline.
    ##
    ##  recs    - array of records (every record is an array of values)
    ##  headers - array of column names
    ##
    ##  returns the csv document as a string
    def data_to_csv( recs, headers )
      lines = [ values_to_csv( headers ) ]
      recs.each { |rec| lines << values_to_csv( rec ) }

      lines.map { |line| "#{line}\n" }.join
    end

  end # module Utils
end # module Factbook
|
@@ -1,128 +0,0 @@
|
|
1
|
-
|
2
|
-
module Factbook
  module Utils

    #######
    ## find meta data (about page info)


    #### e.g. Page last updated on September 16, 2015

    ## maps english month names (capitalized) to the month number (as a string)
    MONTH_EN_TO_S={
      'January'   => '1',
      'February'  => '2',
      'March'     => '3',
      'April'     => '4',
      'May'       => '5',
      'June'      => '6',
      'July'      => '7',
      'August'    => '8',
      'September' => '9',
      'October'   => '10',
      'November'  => '11',
      'December'  => '12'
    }


    ##
    # examples (to match):
    #   Page last updated on November 03, 2016
    #   Page last updated on September 24, 2015

    PAGE_LAST_UPDATED_REGEX = /
      Page \s last \s updated \s on \s
      (?<month_en>[a-z]+) \s
      (?<day>\d{1,2}), \s
      (?<year>\d{4})
    /imx

    ## Finds the "Page last updated on ..." note in the page.
    ##
    ##  html - page source (string)
    ##
    ##  returns the date (Date) or nil if there is no note or
    ##   if the month name is unknown;
    ##   raises ArgumentError (from Date.strptime) for an invalid day e.g. February 31
    def find_page_last_updated( html )
      m = PAGE_LAST_UPDATED_REGEX.match( html )
      return nil if m.nil?

      pp m
      month_en = m[:month_en]
      day      = m[:day]
      year     = m[:year]
      puts "** bingo - month #{month_en}, day #{day}, year #{year}"

      ## note: the regex matches case-insensitive (e.g. SEPTEMBER or september);
      ##   the lookup table is keyed by capitalized names - normalize first
      month = MONTH_EN_TO_S[ month_en.capitalize ]
      return nil if month.nil?   ## unknown month name - no usable date

      date_str = "#{year}-#{month}-#{day}"
      pp date_str
      Date.strptime( date_str, '%Y-%m-%d' )
    end


    ## fallback: find "standalone" country code
    ##  e.g.
    ##   ccode='au'

    COUNTRY_CODE_REGEX = /ccode='(?<cc>[a-z]+)'/

    ## Finds the standalone country code (ccode='..') in the page.
    ##
    ##  html - page source (string)
    ##
    ##  returns the country code (string) e.g. 'au' or nil if not found
    def find_country_code( html )
      m = COUNTRY_CODE_REGEX.match( html )
      return nil if m.nil?

      pp m
      cc = m[:cc]
      puts "** bingo - country code #{cc}"
      cc
    end


    ##
    ## e.g. regioncode="eur"
    ##      countrycode="au"
    ##      countryname="Austria"
    ##      flagsubfield=""
    ##      countryaffiliation=""
    ##      flagdescription=""
    ##      flagdescriptionnote=""
    ##      region="Europe"
    ##
    ## note: countryaffiliation may be empty

    PAGE_INFO_REGEX = /
      regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
      \s+
      countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2> ## is k<3> backref
      \s+
      countryname=(?<q3>"|')(?<country>.+?)\k<q3>
      \s+
      [^>]+? ## allow any attribs (note: non-greedy)
      countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4> ## note: might be empty
      \s+
      [^>]+? ## allow any attribs (note: non-greedy)
      region=(?<q5>"|')(?<region>.+?)\k<q5> ## check world - might be empty ?? or for ocean ??
    /imx


    ## Finds the page info attributes (region code, country code, country name,
    ##   country affiliation and region name) in the page.
    ##
    ##  html - page source (string)
    ##
    ##  returns a hash with the keys :country_code, :country_name,
    ##   :country_affiliation, :region_code and :region_name
    ##   or nil if the attributes are not found
    def find_page_info( html )
      m = PAGE_INFO_REGEX.match( html )
      return nil if m.nil?   ## or return empty struct with nils/empty strings - why?? why not??

      pp m

      h = { country_code:        m[:country_code],
            country_name:        m[:country],
            country_affiliation: m[:affiliation],
            region_code:         m[:region_code],
            region_name:         m[:region] }

      puts "** bingo - #{h.inspect}"
      h    ## return hash w/ name-value pairs
    end

  end # module Utils
end # module Factbook
|
data/test/test_builder.rb
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
###
|
2
|
-
# to run use
|
3
|
-
# ruby -I ./lib -I ./test test/test_builder.rb
|
4
|
-
|
5
|
-
|
6
|
-
require 'helper'
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
class TestBuilder < MiniTest::Test

  ## Smoke test: builds the sample pages and writes the debug html
  ##  to ./tmp (one file per country code); passes if nothing raises.
  def test_build
    %w[au be].each do |code|
      ## use/fix: ASCII-8BIT (e.g.keep as is) -???
      ## fix/todo: use ASCII8BIT/binary reader ??
      path = "#{Factbook.root}/test/data/src/#{code}.html"
      b    = Factbook::Builder.from_file( path )
      pp b.sects

      File.open( "./tmp/#{code}.debug.html", 'w' ) { |f| f.write b.html_debug }
    end

    assert true  ## assume everything ok
  end

end # class TestBuilder
|
30
|
-
|
data/test/test_codes.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
###
|
2
|
-
# to run use
|
3
|
-
# ruby -I ./lib -I ./test test/test_codes.rb
|
4
|
-
|
5
|
-
|
6
|
-
require 'helper'
|
7
|
-
|
8
|
-
|
9
|
-
class TestCodes < MiniTest::Test

  ## Checks the number of codes overall, per category and per region
  ##  and that the categories (and the regions) add up to the total.
  def test_codes
    assert_equal 261, Factbook.codes.size
    assert_equal 261, Factbook.codes.to_a.size

    ## expected sizes per category (selector method, size)
    categories = [
      [:countries,    195],
      [:dependencies,  52],
      [:oceans,         5],
      [:world,          1],
      [:others,         2],
      [:misc,           6],
    ]
    categories.each do |name, size|
      assert_equal size, Factbook.codes.public_send( name ).size
    end

    assert_equal 8, Factbook.codes.dependencies_us.size

    ## expected sizes per region (selector method, size)
    regions = [
      [:europe,                      55],
      [:south_asia,                   9],
      [:central_asia,                 6],
      [:east_n_souteast_asia,        22],
      [:middle_east,                 19],
      [:africa,                      56],
      [:north_america,                7],
      [:central_america_n_caribbean, 33],
      [:south_america,               14],
      [:australia_oceania,           30],
      [:antartica,                    4],
    ]
    regions.each do |name, size|
      assert_equal size, Factbook.codes.public_send( name ).size
    end
    assert_equal 5, Factbook.codes.region('Oceans').size
    assert_equal 1, Factbook.codes.region('World').size

    assert_equal 45, Factbook.codes.countries.europe.size

    ['Oceans', 'World'].each do |name|
      assert_equal Factbook.codes.category( name ).size, Factbook.codes.region( name ).size
    end

    ## all categories together must add up to the total
    category_total = [:countries, :others, :dependencies, :misc, :oceans, :world].inject( 0 ) do |sum, name|
      sum + Factbook.codes.public_send( name ).size
    end
    assert_equal 261, category_total

    ## all regions together must add up to the total
    region_total = regions.inject( 0 ) do |sum, (name, _size)|
      sum + Factbook.codes.public_send( name ).size
    end
    region_total += Factbook.codes.region('Oceans').size
    region_total += Factbook.codes.region('World').size
    assert_equal 261, region_total
  end

end # class TestCodes
|
71
|
-
|
72
|
-
|
data/test/test_comparisons.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
###
|
2
|
-
# to run use
|
3
|
-
# ruby -I ./lib -I ./test test/test_comparisons.rb
|
4
|
-
|
5
|
-
|
6
|
-
require 'helper'
|
7
|
-
|
8
|
-
|
9
|
-
class TestComparisons < MiniTest::Test

  ## Checks the number of comparisons (directly and via to_a).
  def test_comparisons
    assert_equal( 74, Factbook.comparisons.size )
    assert_equal( 74, Factbook.comparisons.to_a.size )
  end

end # class TestComparisons
|
data/test/test_item_builder.rb
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
###
|
2
|
-
# to run use
|
3
|
-
# ruby -I ./lib -I ./test test/test_item_builder.rb
|
4
|
-
|
5
|
-
|
6
|
-
require 'helper'
|
7
|
-
|
8
|
-
|
9
|
-
class TestItemBuilder < MiniTest::Test

  ## note: all tests are smoke tests - they pass if the item builder
  ##   reads the html snippet without raising

  def test_location
    html =<<EOS
<div class=category_data>Central Europe, north of Italy and Slovenia</div>
EOS

    read_item( html, 'Location' )
    assert true  ## assume everything ok
  end

  def test_area
    html =<<EOS
<div><span class=category>total: </span><span class=category_data>83,871 sq km</span></div>
<div><span class=category>land: </span><span class=category_data>82,445 sq km</span></div>
<div><span class=category>water: </span><span class=category_data>1,426 sq km</span></div>
EOS

    read_item( html, 'Area' )
    assert true  ## assume everything ok
  end

  def test_land_use
    html =<<EOS
<div><span class=category>agricultural land: </span><span class=category_data>38.4%</span></div>
<div class=category_data>arable land 16.5%; permanent crops 0.8%; permanent pasture 21.1%</div>
<div><span class=category>forest: </span><span class=category_data>47.2%</span></div>
<div><span class=category>other: </span><span class=category_data>14.4% (2011 est.)</span></div>
EOS

    read_item( html, 'Land use' )
    assert true  ## assume everything ok
  end

  def test_contraceptive_prevalence_rate
    html =<<EOS
<div class=category_data>69.6%</div>
<div><span class=category>note: </span><span class=category_data>percent of women aged 18-46 (2008/09)</span></div>
EOS

    read_item( html, 'Contraceptive Prevalence Rate' )
    assert true  ## assume everything ok
  end

  def test_drinking_water_source
    html =<<EOS
<div><span class=category>improved: </span><span class=category_data></span></div>
<div class=category_data>urban: 100% of population</div>
<div class=category_data>rural: 100% of population</div>
<div class=category_data>total: 100% of population</div>
<div><span class=category>unimproved: </span><span class=category_data></span></div>
<div class=category_data>urban: 0% of population</div>
<div class=category_data>rural: 0% of population</div>
<div class=category_data>total: 0% of population (2015 est.)</div>
EOS

    read_item( html, 'Drinking Water Source' )
    assert true  ## assume everything ok
  end

  def test_political_pressure_groups_and_leaders
    html =<<EOS
<div class=category_data>Austrian Trade Union Federation or OeGB (nominally independent but primarily Social Democratic)</div>
<div class=category_data>Federal Economic Chamber (OeVP-dominated)</div>
<div class=category_data>Labor Chamber or AK (Social Democratic-leaning think tank)</div>
<div class=category_data>OeVP-oriented Association of Austrian Industrialists or IV</div>
<div class=category_data>Roman Catholic Church, including its chief lay organization, Catholic Action</div>
<div><span class=category>other: </span><span class=category_data>three composite leagues of the Austrian People's Party or OeVP representing business, labor, farmers, and other nongovernment organizations in the areas of environment and human rights</span></div>
EOS

    read_item( html, 'Political pressure groups and leaders' )
    assert true  ## assume everything ok
  end

  private

  ## Builds an item builder for the html snippet (with the item name) and reads it.
  def read_item( html, name )
    b = Factbook::ItemBuilder.new( html, name )
    b.read
  end

end # class TestItemBuilder
|
97
|
-
|