factbook 0.1.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Manifest.txt +34 -22
- data/README.md +8 -3
- data/Rakefile +2 -263
- data/data/codes.csv +262 -0
- data/data/comparisons.csv +75 -0
- data/lib/factbook/builder.rb +214 -0
- data/lib/factbook/builder_item.rb +93 -0
- data/lib/factbook/codes.rb +119 -0
- data/lib/factbook/comparisons.rb +50 -0
- data/lib/factbook/page.rb +103 -303
- data/lib/factbook/sanitizer.rb +214 -0
- data/lib/factbook/sect.rb +29 -196
- data/lib/factbook/subsect.rb +18 -0
- data/lib/factbook/table.rb +52 -0
- data/lib/factbook/utils.rb +85 -0
- data/lib/factbook/utils_info.rb +102 -0
- data/lib/factbook/version.rb +4 -3
- data/lib/factbook.rb +23 -1
- data/test/data/au.html +579 -0
- data/test/data/au.yml +8 -0
- data/test/data/be.html +596 -0
- data/test/data/be.yml +8 -0
- data/test/data/src/au.html +2006 -0
- data/test/data/src/be.html +2011 -0
- data/test/helper.rb +0 -4
- data/test/test_builder.rb +37 -0
- data/test/test_codes.rb +76 -0
- data/test/test_comparisons.rb +19 -0
- data/test/test_fields.rb +21 -18
- data/test/test_item_builder.rb +99 -0
- data/test/test_json.rb +17 -20
- data/test/test_page.rb +18 -10
- data/test/test_sanitizer.rb +35 -0
- metadata +68 -49
- data/.gemtest +0 -0
- data/test/data/countrytemplate_au.html +0 -4179
- data/test/data/countrytemplate_be.html +0 -4260
- data/test/data/countrytemplate_br.html +0 -4366
- data/test/data/countrytemplate_ee.html +0 -2999
- data/test/data/countrytemplate_ls.html +0 -2728
- data/test/data/countrytemplate_mx.html +0 -4397
- data/test/data/countrytemplate_vt.html +0 -1726
- data/test/data/countrytemplate_xx.html +0 -2898
- data/test/test_page_old.rb +0 -478
- data/test/test_strip.rb +0 -66
data/lib/factbook/sect.rb
CHANGED
@@ -1,196 +1,29 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module Factbook
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
def cleanup_key( key )
|
32
|
-
|
33
|
-
if @opts[:fields] # if set assume full|long|keep for now
|
34
|
-
### kepe field names as is
|
35
|
-
## e.g.
|
36
|
-
## GDP - composition, by sector of origin:
|
37
|
-
## Budget surplus (+) or deficit (-):
|
38
|
-
## becomes:
|
39
|
-
## GDP - composition, by sector of origin
|
40
|
-
## Budget surplus (+) or deficit (-)
|
41
|
-
key = key.strip
|
42
|
-
key = key.gsub( /[ ]{2,}/, ' ' ) # fold two plus spaces into one -- check if exists?
|
43
|
-
key = key.gsub( /:\z/, '' ) # remove trailing : if present
|
44
|
-
key = key.strip
|
45
|
-
else
|
46
|
-
## to lower case
|
47
|
-
key = key.downcase
|
48
|
-
## seaport(s) => seaports
|
49
|
-
key = key.gsub( '(s)', 's' )
|
50
|
-
key = key.gsub( ':', '' ) # trailing : ## fix: use regex /:$/ w/ anchor??
|
51
|
-
## remove special chars ()+-/,'
|
52
|
-
key = key.gsub( /['()+\-\/,]/, ' ' )
|
53
|
-
key = key.strip
|
54
|
-
key = key.gsub( /[ ]+/, '_' )
|
55
|
-
end
|
56
|
-
|
57
|
-
key
|
58
|
-
end
|
59
|
-
|
60
|
-
|
61
|
-
def sect_to_hash( sect )
|
62
|
-
|
63
|
-
rows = sect.css( 'table tr' )
|
64
|
-
cells = sect.css( 'table tr td' )
|
65
|
-
field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
|
66
|
-
data_ids = rows.css( '#data' )
|
67
|
-
|
68
|
-
logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
|
69
|
-
|
70
|
-
hash = {}
|
71
|
-
last_cat = nil
|
72
|
-
|
73
|
-
cells.each_with_index do |cell,i|
|
74
|
-
## next if i > 14 ## skip after xx for debugging for now
|
75
|
-
|
76
|
-
# check if field or data id
|
77
|
-
# check for (nested) div#field in td
|
78
|
-
has_field_id = cell.css( '#field' ).size == 1 ? true : false
|
79
|
-
|
80
|
-
# check for td#data
|
81
|
-
has_data_id = cell['id'] == 'data' ? true : false
|
82
|
-
|
83
|
-
if has_field_id
|
84
|
-
|
85
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
86
|
-
if cats.size == 1
|
87
|
-
text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
|
88
|
-
last_cat = text
|
89
|
-
logger.debug " [#{i}] category: >>#{text}<<"
|
90
|
-
else
|
91
|
-
logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
|
92
|
-
logger.warn cell.to_s
|
93
|
-
end
|
94
|
-
|
95
|
-
elsif has_data_id
|
96
|
-
|
97
|
-
cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
|
98
|
-
cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
|
99
|
-
cats_div_data = cell.css( 'div.category_data' )
|
100
|
-
cats_span_data = cell.css( 'span.category_data' )
|
101
|
-
|
102
|
-
logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
|
103
|
-
|
104
|
-
pairs = []
|
105
|
-
last_pair = nil
|
106
|
-
last_pair_data_count = 0
|
107
|
-
|
108
|
-
## loop over div blocks (might be .category or .category_data)
|
109
|
-
cell.children.each_with_index do |child,j|
|
110
|
-
unless child.element?
|
111
|
-
## puts " **** !!!! skipping non-element type >#{child.type}<:"
|
112
|
-
## puts child.to_s
|
113
|
-
next
|
114
|
-
end
|
115
|
-
unless child.name == 'div'
|
116
|
-
logger.warn " **** !!! skipping non-div >#{child.name}<:"
|
117
|
-
logger.warn child.to_s
|
118
|
-
next
|
119
|
-
end
|
120
|
-
|
121
|
-
### check if .category or .category_data
|
122
|
-
if child['class'] == 'category'
|
123
|
-
|
124
|
-
## collect text for category; exclude element w/ class.category_data
|
125
|
-
text = ""
|
126
|
-
child.children.each do |subchild|
|
127
|
-
text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
|
128
|
-
end
|
129
|
-
text = cleanup_key( text )
|
130
|
-
|
131
|
-
value = child.css('span.category_data').text.strip
|
132
|
-
|
133
|
-
logger.debug " -- category >>#{text}<<"
|
134
|
-
|
135
|
-
## start new pair
|
136
|
-
last_pair = [ text, value ]
|
137
|
-
last_pair_data_count = 0
|
138
|
-
pairs << last_pair
|
139
|
-
|
140
|
-
elsif child['class'] == 'category_data'
|
141
|
-
logger.debug " -- category_data"
|
142
|
-
|
143
|
-
text = child.text.strip
|
144
|
-
|
145
|
-
if last_pair.nil?
|
146
|
-
## assume its the very first entry; use implied/auto-created category
|
147
|
-
last_pair = [ 'text', '' ]
|
148
|
-
last_pair_data_count = 0
|
149
|
-
pairs << last_pair
|
150
|
-
end
|
151
|
-
|
152
|
-
### first category_data element?
|
153
|
-
if last_pair_data_count == 0
|
154
|
-
if last_pair[1] == ''
|
155
|
-
last_pair[1] = text
|
156
|
-
else
|
157
|
-
last_pair[1] += " #{text}" ## append w/o separator
|
158
|
-
end
|
159
|
-
else
|
160
|
-
if last_cat == 'demographic_profile' || last_cat == 'Demographic profile' ## special case (use space a sep)
|
161
|
-
last_pair[1] += " #{text}" ## append with separator
|
162
|
-
else
|
163
|
-
last_pair[1] += "; #{text}" ## append with separator
|
164
|
-
end
|
165
|
-
end
|
166
|
-
last_pair_data_count += 1
|
167
|
-
|
168
|
-
else
|
169
|
-
logger.warn " **** !!! skipping div w/o category or category_data class:"
|
170
|
-
logger.warn child.to_s
|
171
|
-
end
|
172
|
-
end
|
173
|
-
|
174
|
-
## pp pairs
|
175
|
-
|
176
|
-
## pairs to hash
|
177
|
-
pairs_hash = {}
|
178
|
-
pairs.each do |pair|
|
179
|
-
pairs_hash[ pair[0] ] = pair[1]
|
180
|
-
end
|
181
|
-
|
182
|
-
hash[ last_cat ] = pairs_hash
|
183
|
-
|
184
|
-
else
|
185
|
-
logger.warn "#### !!!! unknown cell type (no field or data id found):"
|
186
|
-
logger.warn cell.to_s
|
187
|
-
end
|
188
|
-
end # each cell
|
189
|
-
|
190
|
-
hash # return hash
|
191
|
-
|
192
|
-
end # method sect_to_hash
|
193
|
-
|
194
|
-
end # class Sect
|
195
|
-
|
196
|
-
end # module Factbook
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
class Sect
  include LogUtils::Logging

  ## title of this section and the list of subsections collected under it
  attr_accessor :title, :subsects   ## use name instead of title - why? why not?

  ## Starts out with an empty list of subsections; title stays unset (nil).
  def initialize
    @subsects = []
  end

  ##
  ## Builds a hash keyed by subsection title that holds each subsection's data.
  ##  note: the hash is rebuilt from scratch on every call (and kept in @data);
  ##        if two subsections share a title the later one wins
  def data
    @data = {}
    subsects.each { |sub| @data[ sub.title ] = sub.data }
    @data
  end

end # class Sect
|
28
|
+
|
29
|
+
end # module Factbook
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
class Subsect
  include LogUtils::Logging

  ## title - use name instead of title - why? why not?
  ## data  - hash holding data e.g. { 'text' => '...' etc. }
  attr_accessor :title, :data

  ## Starts out with an empty data hash; title stays unset (nil).
  def initialize
    @data = {}
  end

end # class Subsect
|
17
|
+
|
18
|
+
end # module Factbook
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
##
|
6
|
+
## make more "generic" - why? why not?
|
7
|
+
## (re)use for other files ?? move to textutils ??
|
8
|
+
|
9
|
+
##
|
10
|
+
## for now reads in rows with values separated by at least 3+ spaces e.g.:
|
11
|
+
## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
|
12
|
+
## 1 China 1,367,485,388
|
13
|
+
## 2 India 1,251,695,584
|
14
|
+
## 3 European Union 513,949,445
|
15
|
+
## 4 United States 321,368,864
|
16
|
+
## 5 Indonesia 255,993,674
|
17
|
+
## 6 Brazil 204,259,812
|
18
|
+
|
19
|
+
|
20
|
+
class TableReader
  include LogUtils::Logging

  ## text - the raw table; anything that responds to each_line (e.g. a String)
  def initialize( text )
    @text = text
  end

  ##
  ## Splits the text into records - one record (array of strings) per non-blank line.
  ##  - leading and trailing whitespace gets stripped from every line
  ##  - blank lines get skipped (reported via the logger instead of printing to stdout)
  ##  - values are split on runs of three or more spaces
  ##
  ## returns an array of records e.g. [['1','China','1,367,485,388'], ...]
  def read
    recs = []

    @text.each_line.with_index( 1 ) do |line, line_no|
      line = line.strip    ## remove leading and trailing whitespace
      if line.empty?
        ## note: logger comes from LogUtils::Logging (same as used by the other classes)
        logger.debug "** skipping empty line #{line_no}"
        next
      end

      ## split three or more spaces - use just two ?? why? why not??
      recs << line.split( /[ ]{3,}/ )
    end

    recs
  end

end # class TableReader
|
51
|
+
|
52
|
+
end # module Factbook
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
module Utils

  ########################################
  ## todo: move to textutils - why, why not ?????

  ##
  ## Converts text to UTF-8 and collects a list of encoding errors.
  ##
  ##  text    - the raw page text; gets read as binary (bytes)
  ##  returns - a two-element array [text, errors] where
  ##              text   is the converted string and
  ##              errors is a list of timestamped messages - one per replaced char
  ##
  ## note: factbook claims utf-8 - but includes invalid bytes in some pages
  ##         encoding is likely western/windows-*
  ##
  ## note: every char that cannot be converted gets replaced by
  ##         � - unknown/invalid unicode char
  ##
  ## NOTE(review): with 'binary' as the source encoding all bytes >= 0x80 look
  ##   to be undefined for the conversion - that would include the bytes of
  ##   valid utf-8 multi-byte chars too; confirm that this is intended
  ##
  ## fix/todo: use ASCII-8BIT instead of binary
  def encode_utf8( text )
    replacement = '�'
    errors      = []    ## also return list of encoding errors

    text = text.encode( 'UTF-8', 'binary', :invalid => :replace,
                                           :undef   => :replace,
                                           :replace => replacement )

    ## check for replaced/invalid chars and log warning
    pos = text.index( replacement )
    while pos
      ## note: clamp at zero - a negative start would index from the end of
      ##         the string and cut out the wrong (or no) context
      from   = [pos-10, 0].max
      to     = pos+10             ## running past the end is fine for a range
      around = text[from..to]

      msg = "invalid char on pos #{pos} around: >#{around}<"
      puts msg

      ## also log message / w timestamp
      errors << "#{Time.now} - #{msg}"

      pos = text.index( replacement, pos+1 )
    end

    [text,errors]    ## return text and errors (list)
  end


  ##
  ## Joins values into a single csv line (without trailing newline).
  ##
  ##  - numbers with thousands separators lose the commas
  ##      e.g. 17,098,242 => 17098242, $17,098,242 => $17098242, -44,000 => -44000
  ##  - values with a comma, a double quote or a line break get double quoted
  ##      (embedded double quotes get doubled)  e.g. Guam, The => "Guam, The"
  ##  - everything else gets added as is 1:1
  ##
  ##  values  - list of values; non-strings get converted with to_s
  ##  returns - the csv line (string)
  def values_to_csv( values )
    cells = values.map do |value|
      value = value.to_s

      ## note: allow optional minus and optional $ sign e.g. -44,000 or $100,000,000
      ##       use \A and \z to anchor on the whole value (not on a line)
      if value =~ /\A-?\$?[1-9][,0-9]+[0-9]\z/     ### find a better regex - why? why not??
        value.gsub( ',', '' )
      elsif value =~ /[,"\r\n]/
        '"' + value.gsub( '"', '""' ) + '"'
      else
        value
      end
    end

    cells.join( ',' )
  end


  ##
  ## Builds the complete csv text - the header line followed by one line per record.
  ##
  ##  recs    - list of records (every record is a list of values)
  ##  headers - list of column names
  ##  returns - the csv text; every line (incl. the last) ends with a newline
  def data_to_csv( recs, headers )
    text = values_to_csv( headers ) << "\n"

    recs.each do |rec|
      text << values_to_csv( rec ) << "\n"
    end

    text
  end


end # module Utils
|
85
|
+
end # module Factbook
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
module Utils

  #######
  ## find meta data (about page info)


  #### e.g. Page last updated on September 16, 2015

  ## maps english month names to month numbers (as strings)
  MONTH_EN_TO_S = {
    'January'   => '1',
    'February'  => '2',
    'March'     => '3',
    'April'     => '4',
    'May'       => '5',
    'June'      => '6',
    'July'      => '7',
    'August'    => '8',
    'September' => '9',
    'October'   => '10',
    'November'  => '11',
    'December'  => '12'
  }.freeze

  ## note: case-insensitive; month_en matches any word (gets checked on lookup)
  PAGE_LAST_UPDATED_REGEX = /
                             Page \s last \s updated \s on \s
                               (?<month_en>[a-z]+) \s
                               (?<day>\d{1,2}), \s
                               (?<year>\d{4})
                            /imx

  ##
  ## Looks for "Page last updated on <month> <day>, <year>" in the page.
  ##
  ##  html    - the page source (string)
  ##  returns - the date (Date) or
  ##            nil if the marker is missing or the month name is unknown
  ##  raises  - whatever Date.strptime raises for an impossible date (e.g. February 31)
  def find_page_last_updated( html )
    m = PAGE_LAST_UPDATED_REGEX.match( html )
    return nil unless m

    ## note: the regex matches any case (e.g. SEPTEMBER or september)
    ##         but the lookup table uses capitalized names
    month = MONTH_EN_TO_S[ m[:month_en].capitalize ]
    return nil unless month

    Date.strptime( "#{m[:year]}-#{month}-#{m[:day]}", '%Y-%m-%d' )
  end

  ##
  ## e.g. regioncode="eur"
  ##      countrycode="au"
  ##      countryname="Austria"
  ##      flagsubfield=""
  ##      countryaffiliation=""
  ##      flagdescription=""
  ##      flagdescriptionnote=""
  ##      region="Europe"
  ##
  ## note: countryaffiliation may be empty

  PAGE_INFO_REGEX = /
                     regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
                       \s+
                     countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2>    ## closing quote must match opening quote (backref)
                       \s+
                     countryname=(?<q3>"|')(?<country>.+?)\k<q3>
                       \s+
                     [^>]+?     ## allow any attribs (note: non-greedy)
                     countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4>   ## note: might be empty
                       \s+
                     [^>]+?     ## allow any attribs (note: non-greedy)
                     region=(?<q5>"|')(?<region>.+?)\k<q5>       ## check world - might be empty ?? or for ocean ??
                    /imx


  ##
  ## Looks for the country and region attributes (regioncode, countrycode, etc.) in the page.
  ##
  ##  html    - the page source (string)
  ##  returns - hash with the keys :country_code, :country_name, :country_affiliation,
  ##              :region_code and :region_name (all values are strings) or
  ##            nil if the attributes are missing
  def find_page_info( html )
    m = PAGE_INFO_REGEX.match( html )
    return nil unless m    ## or return empty struct with nils/empty strings - why?? why not??

    ## return hash w/ name-value pairs
    { country_code:        m[:country_code],
      country_name:        m[:country],
      country_affiliation: m[:affiliation],
      region_code:         m[:region_code],
      region_name:         m[:region] }
  end


end # module Utils
|
102
|
+
end # module Factbook
|
data/lib/factbook/version.rb
CHANGED
data/lib/factbook.rb
CHANGED
@@ -7,6 +7,7 @@ require 'uri'
|
|
7
7
|
require 'cgi'
|
8
8
|
require 'pp'
|
9
9
|
require 'json'
|
10
|
+
require 'csv'
|
10
11
|
require 'fileutils'
|
11
12
|
|
12
13
|
|
@@ -21,11 +22,32 @@ require 'nokogiri'
|
|
21
22
|
# our own code
|
22
23
|
|
23
24
|
require 'factbook/version' # let it always go first
|
25
|
+
require 'factbook/utils'
|
26
|
+
require 'factbook/utils_info'
|
27
|
+
require 'factbook/sanitizer'
|
28
|
+
require 'factbook/builder_item'
|
29
|
+
require 'factbook/builder'
|
24
30
|
require 'factbook/page'
|
25
31
|
require 'factbook/sect'
|
32
|
+
require 'factbook/subsect'
|
26
33
|
|
34
|
+
require 'factbook/codes'
|
35
|
+
require 'factbook/comparisons'
|
27
36
|
|
37
|
+
require 'factbook/table' ## e.g. TableReader
|
28
38
|
|
29
39
|
|
30
|
-
puts Factbook.banner
|
31
40
|
|
41
|
+
module Factbook

  ## auto-load builtin codes and comparisons
  ##  (both csv files get read once - when this file gets required)
  CODES       = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
  COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv")

  class << self
    ## returns the builtin codes (same object as CODES)
    def codes
      CODES
    end

    ## returns the builtin comparisons (same object as COMPARISONS)
    def comparisons
      COMPARISONS
    end
  end

end # module Factbook
|
51
|
+
|
52
|
+
|
53
|
+
puts Factbook.banner if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG
|