factbook 0.1.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Manifest.txt +34 -22
- data/README.md +8 -3
- data/Rakefile +2 -263
- data/data/codes.csv +262 -0
- data/data/comparisons.csv +75 -0
- data/lib/factbook/builder.rb +214 -0
- data/lib/factbook/builder_item.rb +93 -0
- data/lib/factbook/codes.rb +119 -0
- data/lib/factbook/comparisons.rb +50 -0
- data/lib/factbook/page.rb +103 -303
- data/lib/factbook/sanitizer.rb +214 -0
- data/lib/factbook/sect.rb +29 -196
- data/lib/factbook/subsect.rb +18 -0
- data/lib/factbook/table.rb +52 -0
- data/lib/factbook/utils.rb +85 -0
- data/lib/factbook/utils_info.rb +102 -0
- data/lib/factbook/version.rb +4 -3
- data/lib/factbook.rb +23 -1
- data/test/data/au.html +579 -0
- data/test/data/au.yml +8 -0
- data/test/data/be.html +596 -0
- data/test/data/be.yml +8 -0
- data/test/data/src/au.html +2006 -0
- data/test/data/src/be.html +2011 -0
- data/test/helper.rb +0 -4
- data/test/test_builder.rb +37 -0
- data/test/test_codes.rb +76 -0
- data/test/test_comparisons.rb +19 -0
- data/test/test_fields.rb +21 -18
- data/test/test_item_builder.rb +99 -0
- data/test/test_json.rb +17 -20
- data/test/test_page.rb +18 -10
- data/test/test_sanitizer.rb +35 -0
- metadata +68 -49
- data/.gemtest +0 -0
- data/test/data/countrytemplate_au.html +0 -4179
- data/test/data/countrytemplate_be.html +0 -4260
- data/test/data/countrytemplate_br.html +0 -4366
- data/test/data/countrytemplate_ee.html +0 -2999
- data/test/data/countrytemplate_ls.html +0 -2728
- data/test/data/countrytemplate_mx.html +0 -4397
- data/test/data/countrytemplate_vt.html +0 -1726
- data/test/data/countrytemplate_xx.html +0 -2898
- data/test/test_page_old.rb +0 -478
- data/test/test_strip.rb +0 -66
data/lib/factbook/sect.rb
CHANGED
@@ -1,196 +1,29 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
def cleanup_key( key )
|
32
|
-
|
33
|
-
if @opts[:fields] # if set assume full|long|keep for now
|
34
|
-
### kepe field names as is
|
35
|
-
## e.g.
|
36
|
-
## GDP - composition, by sector of origin:
|
37
|
-
## Budget surplus (+) or deficit (-):
|
38
|
-
## becomes:
|
39
|
-
## GDP - composition, by sector of origin
|
40
|
-
## Budget surplus (+) or deficit (-)
|
41
|
-
key = key.strip
|
42
|
-
key = key.gsub( /[ ]{2,}/, ' ' ) # fold two plus spaces into one -- check if exists?
|
43
|
-
key = key.gsub( /:\z/, '' ) # remove trailing : if present
|
44
|
-
key = key.strip
|
45
|
-
else
|
46
|
-
## to lower case
|
47
|
-
key = key.downcase
|
48
|
-
## seaport(s) => seaports
|
49
|
-
key = key.gsub( '(s)', 's' )
|
50
|
-
key = key.gsub( ':', '' ) # trailing : ## fix: use regex /:$/ w/ anchor??
|
51
|
-
## remove special chars ()+-/,'
|
52
|
-
key = key.gsub( /['()+\-\/,]/, ' ' )
|
53
|
-
key = key.strip
|
54
|
-
key = key.gsub( /[ ]+/, '_' )
|
55
|
-
end
|
56
|
-
|
57
|
-
key
|
58
|
-
end
|
59
|
-
|
60
|
-
|
61
|
-
def sect_to_hash( sect )
|
62
|
-
|
63
|
-
rows = sect.css( 'table tr' )
|
64
|
-
cells = sect.css( 'table tr td' )
|
65
|
-
field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
|
66
|
-
data_ids = rows.css( '#data' )
|
67
|
-
|
68
|
-
logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
|
69
|
-
|
70
|
-
hash = {}
|
71
|
-
last_cat = nil
|
72
|
-
|
73
|
-
cells.each_with_index do |cell,i|
|
74
|
-
## next if i > 14 ## skip after xx for debugging for now
|
75
|
-
|
76
|
-
# check if field or data id
|
77
|
-
# check for (nested) div#field in td
|
78
|
-
has_field_id = cell.css( '#field' ).size == 1 ? true : false
|
79
|
-
|
80
|
-
# check for td#data
|
81
|
-
has_data_id = cell['id'] == 'data' ? true : false
|
82
|
-
|
83
|
-
if has_field_id
|
84
|
-
|
85
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
86
|
-
if cats.size == 1
|
87
|
-
text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
|
88
|
-
last_cat = text
|
89
|
-
logger.debug " [#{i}] category: >>#{text}<<"
|
90
|
-
else
|
91
|
-
logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
|
92
|
-
logger.warn cell.to_s
|
93
|
-
end
|
94
|
-
|
95
|
-
elsif has_data_id
|
96
|
-
|
97
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
98
|
-
cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
|
99
|
-
cats_div_data = cell.css( 'div.category_data' )
|
100
|
-
cats_span_data = cell.css( 'span.category_data' )
|
101
|
-
|
102
|
-
logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
|
103
|
-
|
104
|
-
pairs = []
|
105
|
-
last_pair = nil
|
106
|
-
last_pair_data_count = 0
|
107
|
-
|
108
|
-
## loop over div blocks (might be .category or .category_data)
|
109
|
-
cell.children.each_with_index do |child,j|
|
110
|
-
unless child.element?
|
111
|
-
## puts " **** !!!! skipping non-element type >#{child.type}<:"
|
112
|
-
## puts child.to_s
|
113
|
-
next
|
114
|
-
end
|
115
|
-
unless child.name == 'div'
|
116
|
-
logger.warn " **** !!! skipping non-div >#{child.name}<:"
|
117
|
-
logger.warn child.to_s
|
118
|
-
next
|
119
|
-
end
|
120
|
-
|
121
|
-
### check if .category or .category_data
|
122
|
-
if child['class'] == 'category'
|
123
|
-
|
124
|
-
## collect text for category; exclude element w/ class.category_data
|
125
|
-
text = ""
|
126
|
-
child.children.each do |subchild|
|
127
|
-
text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
|
128
|
-
end
|
129
|
-
text = cleanup_key( text )
|
130
|
-
|
131
|
-
value = child.css('span.category_data').text.strip
|
132
|
-
|
133
|
-
logger.debug " -- category >>#{text}<<"
|
134
|
-
|
135
|
-
## start new pair
|
136
|
-
last_pair = [ text, value ]
|
137
|
-
last_pair_data_count = 0
|
138
|
-
pairs << last_pair
|
139
|
-
|
140
|
-
elsif child['class'] == 'category_data'
|
141
|
-
logger.debug " -- category_data"
|
142
|
-
|
143
|
-
text = child.text.strip
|
144
|
-
|
145
|
-
if last_pair.nil?
|
146
|
-
## assume its the very first entry; use implied/auto-created category
|
147
|
-
last_pair = [ 'text', '' ]
|
148
|
-
last_pair_data_count = 0
|
149
|
-
pairs << last_pair
|
150
|
-
end
|
151
|
-
|
152
|
-
### first category_data element?
|
153
|
-
if last_pair_data_count == 0
|
154
|
-
if last_pair[1] == ''
|
155
|
-
last_pair[1] = text
|
156
|
-
else
|
157
|
-
last_pair[1] += " #{text}" ## append w/o separator
|
158
|
-
end
|
159
|
-
else
|
160
|
-
if last_cat == 'demographic_profile' || last_cat == 'Demographic profile' ## special case (use space a sep)
|
161
|
-
last_pair[1] += " #{text}" ## append with separator
|
162
|
-
else
|
163
|
-
last_pair[1] += "; #{text}" ## append with separator
|
164
|
-
end
|
165
|
-
end
|
166
|
-
last_pair_data_count += 1
|
167
|
-
|
168
|
-
else
|
169
|
-
logger.warn " **** !!! skipping div w/o category or category_data class:"
|
170
|
-
logger.warn child.to_s
|
171
|
-
end
|
172
|
-
end
|
173
|
-
|
174
|
-
## pp pairs
|
175
|
-
|
176
|
-
## pairs to hash
|
177
|
-
pairs_hash = {}
|
178
|
-
pairs.each do |pair|
|
179
|
-
pairs_hash[ pair[0] ] = pair[1]
|
180
|
-
end
|
181
|
-
|
182
|
-
hash[ last_cat ] = pairs_hash
|
183
|
-
|
184
|
-
else
|
185
|
-
logger.warn "#### !!!! unknown cell type (no field or data id found):"
|
186
|
-
logger.warn cell.to_s
|
187
|
-
end
|
188
|
-
end # each cell
|
189
|
-
|
190
|
-
hash # return hash
|
191
|
-
|
192
|
-
end # method sect_to_hash
|
193
|
-
|
194
|
-
end # class Sect
|
195
|
-
|
196
|
-
end # module Factbook
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
class Sect
  include LogUtils::Logging

  ## title    - heading of this section (use name instead of title - why? why not?)
  ## subsects - list of subsections; starts out empty (see initialize)
  attr_accessor :title, :subsects

  def initialize
    @subsects = []
  end

  ## Rebuilds (on every call) a hash that maps each subsection title
  ## to that subsection's data; the hash is also kept in @data.
  ## note: if two subsections share a title the later one wins.
  def data
    @data = {}
    subsects.each { |subsect| @data[ subsect.title ] = subsect.data }
    @data
  end
end # class Sect
|
28
|
+
|
29
|
+
end # module Factbook
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
class Subsect
  include LogUtils::Logging

  ## title - heading of this subsection (use name instead of title - why? why not?)
  ## data  - hash holding data e.g. { 'text' => '...' etc. }
  attr_accessor :title, :data

  ## starts out with an empty data hash; title stays unset (nil)
  def initialize
    @data = {}
  end
end # class Subsect
|
17
|
+
|
18
|
+
end # module Factbook
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
##
|
6
|
+
## make more "generic" - why? why not?
|
7
|
+
## (re)use for other files ?? move to textutils ??
|
8
|
+
|
9
|
+
##
|
10
|
+
## for now reads in rows with values separated by at least 3+ spaces e.g.:
|
11
|
+
## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
|
12
|
+
## 1 China 1,367,485,388
|
13
|
+
## 2 India 1,251,695,584
|
14
|
+
## 3 European Union 513,949,445
|
15
|
+
## 4 United States 321,368,864
|
16
|
+
## 5 Indonesia 255,993,674
|
17
|
+
## 6 Brazil 204,259,812
|
18
|
+
|
19
|
+
|
20
|
+
class TableReader
  include LogUtils::Logging

  ## text - the raw table; one record per line
  def initialize( text )
    @text = text
  end

  ## Returns an array of records; every record is the array of column values
  ## found in one line. Columns are separated by runs of three or more spaces.
  ## Blank lines are reported on stdout (with their 1-based line number) and skipped.
  def read
    @text.each_line.with_index( 1 ).each_with_object( [] ) do |(raw_line, line_no), recs|
      row = raw_line.strip   ## remove leading and trailing whitespace

      if row.empty?
        puts "** skipping empty line #{line_no}"
      else
        ## split on three or more spaces - use just two ?? why? why not??
        recs << row.split( /[ ]{3,}/ )
      end
    end
  end
end # class TableReader
|
51
|
+
|
52
|
+
end # module Factbook
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
module Utils
|
5
|
+
|
6
|
+
########################################
|
7
|
+
## todo: move to textutils - why, why not ?????
|
8
|
+
|
9
|
+
## Converts text to UTF-8 and reports every character that had to be replaced.
##
## text - the raw page; it is transcoded with 'binary' as the declared
##        source encoding, whatever encoding the string is tagged with
##
## Returns a two-element array [text, errors]:
##   text   - the converted string; unconvertible bytes show up as U+FFFD
##   errors - one message per replacement char found (timestamp, position
##            and up to 10 chars of context on either side)
##
## Every problem is also printed to stdout.
##
## NOTE(review): because the source encoding is declared as binary, bytes
##   >= 0x80 count as "undefined" - so valid multi-byte UTF-8 sequences
##   presumably get replaced too (one U+FFFD per byte). Confirm this is intended.
def encode_utf8( text )
  errors = []   ## also return list of encoding errors

  ## note: factbook claims utf-8 - but includes invalid bytes in some pages
  ##   (encoding is likely western/windows-125x)
  text = text.encode( 'UTF-8', 'binary', :invalid => :replace,
                                         :undef   => :replace,
                                         :replace => "\uFFFD" )

  ## check for replaced/invalid chars and log a warning for each
  pos = text.index( "\uFFFD" )
  while pos
    ## clamp the lower bound - a negative start would index from the END of
    ##  the string and yield nil or unrelated context for hits near the start
    ##  (the upper bound needs no clamp; ranges past the end get truncated)
    from   = [pos-10, 0].max
    to     = pos+10
    around = text[from..to]

    msg = "invalid char on pos #{pos} around: >#{around}<"
    puts msg

    errors << "#{Time.now} - #{msg}"   ## also keep message w/ timestamp

    pos = text.index( "\uFFFD", pos+1 )
  end

  [text, errors]   ## return text and errors (list)
end
|
41
|
+
|
42
|
+
|
43
|
+
|
44
|
+
## Turns a list of values into a single csv line (without trailing newline).
##
## values - list of values; every value gets converted with to_s first
##
## Rules (checked in this order):
##  - numbers with thousands separators lose their commas;
##      an optional leading minus and an optional $ sign are allowed
##      e.g. 17,098,242 => 17098242, $100,000,000 => $100000000, -44,000 => -44000
##  - values holding a comma, a double quote or a line break get wrapped in
##      double quotes; embedded double quotes get doubled
##      e.g. Guam, The => "Guam, The"
##  - everything else gets added as is (1:1)
##
## Returns the csv line as a string (empty string for an empty list).
def values_to_csv( values )
  cells = values.map do |value|
    value = value.to_s

    ## note: \A and \z anchor the whole value (^ and $ only anchor a line)
    if value =~ /\A-?\$?[1-9][,0-9]+[0-9]\z/   ### find a better regex - why? why not??
      value.delete( ',' )
    elsif value =~ /[,"\r\n]/
      %Q{"#{value.gsub( '"', '""' )}"}
    else
      value
    end
  end

  cells.join( ',' )
end
|
67
|
+
|
68
|
+
|
69
|
+
## Builds a complete csv document.
##
## recs    - list of records (every record is a list of values)
## headers - list of column names; always becomes the first line
##
## Every line (header line included) gets converted by values_to_csv
## and ends with a newline. Returns the document as one string.
def data_to_csv( recs, headers )
  lines = [ values_to_csv( headers ) ]
  recs.each { |rec| lines << values_to_csv( rec ) }

  lines.map { |line| "#{line}\n" }.join
end
|
82
|
+
|
83
|
+
|
84
|
+
end # module Utils
|
85
|
+
end # module Factbook
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
module Utils
|
5
|
+
|
6
|
+
#######
|
7
|
+
## find meta data (about page info)
|
8
|
+
|
9
|
+
|
10
|
+
#### e.g. Page last updated on September 16, 2015
|
11
|
+
|
12
|
+
## maps (capitalized) english month names to the month number (as string)
MONTH_EN_TO_S = {
  'January'   => '1',
  'February'  => '2',
  'March'     => '3',
  'April'     => '4',
  'May'       => '5',
  'June'      => '6',
  'July'      => '7',
  'August'    => '8',
  'September' => '9',
  'October'   => '10',
  'November'  => '11',
  'December'  => '12'
}.freeze

## e.g. Page last updated on September 16, 2015
##  note: matches case-insensitive; the month is any run of letters
PAGE_LAST_UPDATED_REGEX = /
  Page \s last \s updated \s on \s
  (?<month_en>[a-z]+) \s
  (?<day>\d{1,2}), \s
  (?<year>\d{4})
/imx

## Finds the "Page last updated on <month> <day>, <year>" note in a page.
##
## html - page source (string)
##
## Returns a Date or nil if the note is missing or the month name is unknown.
## Raises an ArgumentError for impossible dates e.g. February 31.
def find_page_last_updated( html )
  m = PAGE_LAST_UPDATED_REGEX.match( html )
  return nil unless m

  ## note: the regex ignores case but the lookup table uses capitalized
  ##   month names - normalize first e.g. september or SEPTEMBER => September
  month = MONTH_EN_TO_S[ m[:month_en].capitalize ]
  return nil unless month   ## not a month name after all

  Date.new( m[:year].to_i, month.to_i, m[:day].to_i )
end
|
52
|
+
|
53
|
+
##
|
54
|
+
## e.g. regioncode="eur"
|
55
|
+
## countrycode="au"
|
56
|
+
## countryname="Austria"
|
57
|
+
## flagsubfield=""
|
58
|
+
## countryaffiliation=""
|
59
|
+
## flagdescription=""
|
60
|
+
## flagdescriptionnote=""
|
61
|
+
## region="Europe"
|
62
|
+
##
|
63
|
+
## note: countryaffiliation may be empty
|
64
|
+
|
65
|
+
|
66
|
+
|
67
|
+
## matches the page info attributes; the attributes must show up in this order:
##   regioncode, countrycode, countryname, [any others], countryaffiliation,
##   [any others], region
## note: values may be single- or double-quoted (q1..q5 remember the opening
##   quote for the backrefs); all matching happens inside a single tag
##   (the "any others" filler never crosses a closing angle bracket)
PAGE_INFO_REGEX = /
  regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
  \s+
  countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2>
  \s+
  countryname=(?<q3>"|')(?<country>.+?)\k<q3>
  \s+
  [^>]*?    ## allow any (or no) attribs in between (note: non-greedy)
  countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4>    ## note: might be empty
  \s+
  [^>]*?    ## allow any (or no) attribs in between (note: non-greedy)
  region=(?<q5>"|')(?<region>.+?)\k<q5>   ## check world - might be empty ?? or for ocean ??
/imx

## Finds the page info (country and region codes and names) in a page.
##
## html - page source (string)
##
## Returns a hash with the (symbol) keys
##   :country_code, :country_name, :country_affiliation, :region_code, :region_name
##   (note: :country_affiliation may be an empty string)
## or nil if the attributes cannot be found.
def find_page_info( html )
  m = PAGE_INFO_REGEX.match( html )
  return nil unless m   ## or return empty struct with nils/empty strings - why?? why not??

  { country_code:        m[:country_code],
    country_name:        m[:country],
    country_affiliation: m[:affiliation],
    region_code:         m[:region_code],
    region_name:         m[:region] }
end
|
99
|
+
|
100
|
+
|
101
|
+
end # module Utils
|
102
|
+
end # module Factbook
|
data/lib/factbook/version.rb
CHANGED
data/lib/factbook.rb
CHANGED
@@ -7,6 +7,7 @@ require 'uri'
|
|
7
7
|
require 'cgi'
|
8
8
|
require 'pp'
|
9
9
|
require 'json'
|
10
|
+
require 'csv'
|
10
11
|
require 'fileutils'
|
11
12
|
|
12
13
|
|
@@ -21,11 +22,32 @@ require 'nokogiri'
|
|
21
22
|
# our own code
|
22
23
|
|
23
24
|
require 'factbook/version' # let it always go first
|
25
|
+
require 'factbook/utils'
|
26
|
+
require 'factbook/utils_info'
|
27
|
+
require 'factbook/sanitizer'
|
28
|
+
require 'factbook/builder_item'
|
29
|
+
require 'factbook/builder'
|
24
30
|
require 'factbook/page'
|
25
31
|
require 'factbook/sect'
|
32
|
+
require 'factbook/subsect'
|
26
33
|
|
34
|
+
require 'factbook/codes'
|
35
|
+
require 'factbook/comparisons'
|
27
36
|
|
37
|
+
require 'factbook/table' ## e.g. TableReader
|
28
38
|
|
29
39
|
|
30
|
-
puts Factbook.banner
|
31
40
|
|
41
|
+
module Factbook

  ## builtin codes and comparisons
  ##   note: both csv files get read right here, that is, once when this file gets loaded
  CODES       = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
  COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv")

  ## returns the builtin codes (shortcut for Factbook::CODES)
  def self.codes
    CODES
  end

  ## returns the builtin comparisons (shortcut for Factbook::COMPARISONS)
  def self.comparisons
    COMPARISONS
  end

end # module Factbook
|
51
|
+
|
52
|
+
|
53
|
+
puts Factbook.banner if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG
|