factbook 1.2.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -1
- data/README.md +548 -543
- data/Rakefile +34 -33
- data/data/codes.csv +262 -262
- data/data/codesxref.csv +280 -280
- data/lib/factbook.rb +68 -75
- data/lib/factbook/builder.rb +14 -3
- data/lib/factbook/builder_item.rb +93 -59
- data/lib/factbook/page.rb +20 -57
- data/lib/factbook/sanitizer.rb +98 -285
- data/lib/factbook/version.rb +21 -22
- data/script/json.rb +3 -2
- data/test/data/src/au.html +658 -658
- data/test/data/src/be.html +648 -648
- data/test/helper.rb +11 -11
- data/test/test_fields.rb +52 -52
- data/test/test_json.rb +45 -45
- data/test/test_page.rb +38 -38
- metadata +31 -11
data/lib/factbook.rb
CHANGED
@@ -1,75 +1,68 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
require '
|
6
|
-
require '
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
require '
|
11
|
-
|
12
|
-
require '
|
13
|
-
require '
|
14
|
-
|
15
|
-
|
16
|
-
##
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
require '
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
require 'factbook/
|
51
|
-
require 'factbook/
|
52
|
-
require 'factbook/
|
53
|
-
require 'factbook/
|
54
|
-
|
55
|
-
require 'factbook/
|
56
|
-
require 'factbook/
|
57
|
-
|
58
|
-
require 'factbook/
|
59
|
-
|
60
|
-
require 'factbook/
|
61
|
-
|
62
|
-
require 'factbook/
|
63
|
-
require 'factbook/
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
require 'factbook/db/schema' ## database (sql tables) support
|
70
|
-
require 'factbook/db/models'
|
71
|
-
require 'factbook/db/importer'
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
puts Factbook.banner if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG
|
1
|
+
## stdlibs
|
2
|
+
|
3
|
+
|
4
|
+
require 'cgi'
|
5
|
+
require 'csv' ## fix: use csvreader!!!!
|
6
|
+
require 'erb' ## used by Almanac class (for render)
|
7
|
+
|
8
|
+
|
9
|
+
## 3rd party gems/libs
|
10
|
+
## require 'props'
|
11
|
+
|
12
|
+
require 'logutils'
|
13
|
+
require 'webget'
|
14
|
+
require 'nokogiri'
|
15
|
+
|
16
|
+
require 'active_record' ## add activerecord/db support (NOT optional for now)
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
# our own code
|
21
|
+
require 'factbook/version' # let it always go first
|
22
|
+
|
23
|
+
|
24
|
+
require 'factbook/codes'
|
25
|
+
require 'factbook/comparisons'
|
26
|
+
require 'factbook/attributes'
|
27
|
+
|
28
|
+
module Factbook
|
29
|
+
|
30
|
+
## auto-load builtin codes, comparisons, attributes, etc.
|
31
|
+
CODES = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
|
32
|
+
COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv" )
|
33
|
+
ATTRIBUTES = Attributes.from_yaml( "#{Factbook.root}/data/attributes.yml" )
|
34
|
+
|
35
|
+
def self.codes() CODES; end
|
36
|
+
def self.comparisons() COMPARISONS; end
|
37
|
+
def self.attributes() ATTRIBUTES; end
|
38
|
+
|
39
|
+
end # module Factbook
|
40
|
+
|
41
|
+
## note: make codes, comparisons, attributes available
|
42
|
+
|
43
|
+
require 'factbook/utils'
|
44
|
+
require 'factbook/utils_info'
|
45
|
+
require 'factbook/sanitizer'
|
46
|
+
require 'factbook/normalize'
|
47
|
+
require 'factbook/builder_item'
|
48
|
+
require 'factbook/builder'
|
49
|
+
require 'factbook/builder_json'
|
50
|
+
require 'factbook/page'
|
51
|
+
require 'factbook/page_info'
|
52
|
+
require 'factbook/sect'
|
53
|
+
require 'factbook/subsect'
|
54
|
+
|
55
|
+
require 'factbook/reader_json'
|
56
|
+
require 'factbook/almanac'
|
57
|
+
|
58
|
+
require 'factbook/table' ## e.g. TableReader
|
59
|
+
|
60
|
+
require 'factbook/counter'
|
61
|
+
|
62
|
+
require 'factbook/db/schema' ## database (sql tables) support
|
63
|
+
require 'factbook/db/models'
|
64
|
+
require 'factbook/db/importer'
|
65
|
+
|
66
|
+
|
67
|
+
|
68
|
+
puts Factbook.banner
|
data/lib/factbook/builder.rb
CHANGED
@@ -40,12 +40,23 @@ def initialize( html_ascii )
|
|
40
40
|
## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
|
41
41
|
@html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
|
42
42
|
|
43
|
-
@html_debug = map_sects( @html )
|
44
|
-
@html_debug = map_subsects( @html_debug )
|
45
43
|
|
46
|
-
html_sects =
|
44
|
+
html_sects = if @html.empty?
|
45
|
+
## note: support "empty" pages - old format waiting for update!!!
|
46
|
+
## cannot parse for now
|
47
|
+
[] ## return empty (no) sections for now - sorry (it's just one page with code cc anyway!!)
|
48
|
+
else
|
49
|
+
@html_debug = map_sects( @html )
|
50
|
+
@html_debug = map_subsects( @html_debug )
|
51
|
+
|
52
|
+
split_sects( @html_debug )
|
53
|
+
end
|
54
|
+
|
47
55
|
pp html_sects
|
48
56
|
|
57
|
+
## debug
|
58
|
+
## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
|
59
|
+
|
49
60
|
|
50
61
|
@sects = []
|
51
62
|
html_sects.each do |html_sect|
|
@@ -5,88 +5,122 @@ module Factbook
|
|
5
5
|
class ItemBuilder ## rename to ItemReader, ItemParser - why? why not??
|
6
6
|
include LogUtils::Logging
|
7
7
|
include NormalizeHelper ## e.g. normalize_category
|
8
|
-
|
8
|
+
|
9
9
|
def initialize( html, name )
|
10
10
|
@html = html
|
11
11
|
@name = name # add category/field name e.g. Area, Location, etc.
|
12
12
|
end
|
13
|
-
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
##
|
17
|
+
## <div class="category_data subfield text">
|
18
|
+
## Portuguese (official and most widely spoken language)
|
19
|
+
##
|
20
|
+
## </div>
|
21
|
+
## <div class="category_data note">
|
22
|
+
## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
|
23
|
+
## </div>
|
24
|
+
|
25
|
+
|
14
26
|
def read
|
15
27
|
## return hash from html snippet
|
16
28
|
doc = Nokogiri::HTML.fragment( @html )
|
17
29
|
|
18
30
|
data = {}
|
19
|
-
last_node = nil ## track last hash (always use text key)
|
20
|
-
last_node_data_count = 0
|
21
31
|
|
22
32
|
## note:
|
23
33
|
## skip whitespace text nodes (e.g. \n\n etc); just use divs
|
24
|
-
doc.children.filter('div')
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
else ### possible ??? if data_count is zero - not should not include any data
|
42
|
-
## todo: issue warning here - why? why not??
|
43
|
-
last_node['text'] += " #{text}" ## append w/o separator
|
44
|
-
end
|
45
|
-
else
|
46
|
-
if @name == 'Demographic profile' ## special case (use space a sep)
|
47
|
-
last_node['text'] += " #{text}" ## append without (w/o) separator
|
34
|
+
doc_children = doc.children.filter('div')
|
35
|
+
|
36
|
+
puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
|
37
|
+
|
38
|
+
doc_children.each_with_index do |div,i|
|
39
|
+
if div['class'].index( 'note' )
|
40
|
+
text = squish( div.text.strip )
|
41
|
+
puts "category_data: >#{text}<"
|
42
|
+
|
43
|
+
data['note'] = { 'text' => text }
|
44
|
+
elsif div['class'].index( 'historic' )
|
45
|
+
## add all historic together into one for now
|
46
|
+
text = squish( div.text.strip )
|
47
|
+
puts "category_data: >#{text}<"
|
48
|
+
|
49
|
+
if i == 0
|
50
|
+
data['text'] = text
|
48
51
|
else
|
49
|
-
|
52
|
+
## append with / for now
|
53
|
+
data['text'] += " / #{text}"
|
50
54
|
end
|
51
|
-
|
52
|
-
|
55
|
+
elsif div.css( 'span.subfield-name').empty?
|
56
|
+
## assume "implied text field"
|
57
|
+
## check for index == 1 / child count == 1 - why? why not
|
58
|
+
text = squish( div.text.strip ) ## fix/todo: use strip
|
59
|
+
puts "category_data: >#{text}<"
|
53
60
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
if
|
58
|
-
puts "
|
61
|
+
data['text'] = text
|
62
|
+
|
63
|
+
## must be always first node for now
|
64
|
+
if i != 0
|
65
|
+
puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
|
66
|
+
puts @html
|
67
|
+
exit 1
|
59
68
|
end
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
69
|
+
elsif div['class'].index( 'grouped_subfield' )
|
70
|
+
## split grouped subfield!!
|
71
|
+
## <span class="subfield-name">arable land:</span>
|
72
|
+
## <span class="subfield-number">8.6%</span>
|
73
|
+
## <span class="subfield-date">(2011 est.)</span>
|
74
|
+
## /
|
75
|
+
## <span class="subfield-name">permanent crops:</span>
|
76
|
+
## <span class="subfield-number">0.8%</span>
|
77
|
+
## <span class="subfield-date">(2011 est.)</span>
|
78
|
+
## /
|
79
|
+
## <span class="subfield-name">permanent pasture:</span>
|
80
|
+
## <span class="subfield-number">23.5%</span>
|
81
|
+
## <span class="subfield-date">(2011 est.)</span>
|
82
|
+
|
83
|
+
## join names for now - why? why not?
|
84
|
+
## e.g. becomes:
|
85
|
+
## arable land / permanent crops / permanent pasture: for key ??
|
86
|
+
span_names = div.css( 'span.subfield-name')
|
87
|
+
keys = []
|
88
|
+
span_names.each do |span|
|
89
|
+
keys << normalize_category( span.text.strip )
|
90
|
+
span.replace( '' )
|
91
|
+
end
|
92
|
+
key = keys.join( ' / ')
|
93
|
+
text = squish( div.text.strip )
|
94
|
+
puts "category_data key >#{key}<: >#{text}<"
|
95
|
+
data[ key ] = { 'text' => text }
|
76
96
|
else
|
77
|
-
|
97
|
+
## get subfield name
|
98
|
+
span_names = div.css( 'span.subfield-name')
|
99
|
+
if span_names.size > 1
|
100
|
+
puts "!! ERROR - found more than one subfield-name:"
|
101
|
+
puts div.to_html
|
102
|
+
exit 1
|
103
|
+
end
|
104
|
+
key = normalize_category( span_names[0].text.strip )
|
105
|
+
span_names[0].replace( '' )
|
106
|
+
|
107
|
+
text = squish( div.text.strip )
|
108
|
+
puts "category_data key >#{key}<: >#{text}<"
|
109
|
+
data[ key ] = { 'text' => text }
|
78
110
|
end
|
79
|
-
|
80
|
-
## pp child
|
81
|
-
## css = child['class']
|
82
|
-
## puts "[#{i}] #{child.name} class='>#{css}< : #{css.class.name}' >#{child.text}<"
|
83
111
|
end
|
84
|
-
|
112
|
+
|
113
|
+
|
85
114
|
pp data
|
86
115
|
data
|
87
116
|
end
|
88
117
|
|
89
|
-
|
118
|
+
|
119
|
+
|
120
|
+
def squish( str )
|
121
|
+
str.gsub( /[ \t\n\r]{2,}/, ' ') ## replace runs of two or more whitespace chars (spaces, tabs, newlines) with one space
|
122
|
+
end
|
123
|
+
|
90
124
|
end # class ItemBuilder
|
91
125
|
|
92
126
|
end # module Factbook
|
data/lib/factbook/page.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
@@ -38,10 +37,10 @@ class Page
|
|
38
37
|
|
39
38
|
def initialize( code, opts={} )
|
40
39
|
### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
|
41
|
-
|
40
|
+
|
42
41
|
if opts[:json]
|
43
42
|
json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
|
44
|
-
b = JsonBuilder.from_string( json )
|
43
|
+
b = JsonBuilder.from_string( json )
|
45
44
|
else ## assume html
|
46
45
|
if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
|
47
46
|
## for debugging and testing allow "custom" passed-in html page
|
@@ -49,11 +48,13 @@ class Page
|
|
49
48
|
else
|
50
49
|
url_string = SITE_BASE.gsub( '{code}', code )
|
51
50
|
## note: expects ASCII-7BIT/BINARY encoding
|
52
|
-
|
53
|
-
|
51
|
+
|
52
|
+
## html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
|
53
|
+
html = Webcache.read( url_string )
|
54
|
+
end
|
54
55
|
b = Builder.from_string( html )
|
55
56
|
end
|
56
|
-
|
57
|
+
|
57
58
|
@sects = b.sects
|
58
59
|
@info = b.info
|
59
60
|
|
@@ -65,7 +66,7 @@ class Page
|
|
65
66
|
@info = info
|
66
67
|
end
|
67
68
|
|
68
|
-
@data = {}
|
69
|
+
@data = {}
|
69
70
|
@sects.each do |sect|
|
70
71
|
@data[ sect.title ] = sect.data
|
71
72
|
end
|
@@ -78,7 +79,7 @@ class Page
|
|
78
79
|
if opts[:minify]
|
79
80
|
data.to_json
|
80
81
|
else
|
81
|
-
## was: -- opts[:pretty] || opts[:pp]
|
82
|
+
## was: -- opts[:pretty] || opts[:pp]
|
82
83
|
JSON.pretty_generate( data ) ## note: pretty print by default!
|
83
84
|
end
|
84
85
|
end
|
@@ -96,10 +97,10 @@ class Page
|
|
96
97
|
end
|
97
98
|
|
98
99
|
## add convenience (shortcut) accessors / attributes / fields / getters
|
99
|
-
|
100
|
+
|
100
101
|
ATTRIBUTES.each do |attrib|
|
101
102
|
## e.g.
|
102
|
-
## def background() data['Introduction']['Background']['text']; end
|
103
|
+
## def background() data['Introduction']['Background']['text']; end
|
103
104
|
## def location() data['Geography']['Location']['text']; end
|
104
105
|
## etc.
|
105
106
|
if attrib.path.size == 1
|
@@ -114,31 +115,18 @@ class Page
|
|
114
115
|
fetch( attrib.path[1], {} )['text']
|
115
116
|
end
|
116
117
|
end
|
117
|
-
end
|
118
|
+
end
|
118
119
|
|
119
120
|
|
120
121
|
private
|
121
|
-
def fetch_page(
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
# NB: Net::HTTP will NOT set encoding UTF-8 etc.
|
130
|
-
# will mostly be ASCII
|
131
|
-
# - try to change encoding to UTF-8 ourselves
|
132
|
-
logger.debug "t.encoding.name (before): #{t.encoding.name}"
|
133
|
-
#####
|
134
|
-
# NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
135
|
-
t
|
136
|
-
else
|
137
|
-
logger.error "fetch HTTP - #{response.code} #{response.message}"
|
138
|
-
## todo/fix: raise http exception (see fetcher) -- why? why not??
|
139
|
-
fail "fetch HTTP - #{response.code} #{response.message}"
|
140
|
-
nil
|
141
|
-
end
|
122
|
+
def fetch_page( url )
|
123
|
+
response = Webget.page( url )
|
124
|
+
|
125
|
+
## note: exit on get / fetch error - do NOT continue for now - why? why not?
|
126
|
+
exit 1 if response.status.nok? ## e.g. HTTP status code != 200
|
127
|
+
|
128
|
+
|
129
|
+
response.text
|
142
130
|
end
|
143
131
|
|
144
132
|
|
@@ -157,29 +145,4 @@ end
|
|
157
145
|
|
158
146
|
|
159
147
|
end # class Page
|
160
|
-
|
161
|
-
|
162
|
-
=begin
|
163
|
-
class PageFetcher
|
164
|
-
|
165
|
-
def fetch( cc )
|
166
|
-
worker = Fetcher::Worker.new
|
167
|
-
factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
|
168
|
-
|
169
|
-
res = worker.get_response( "#{factbook_base}/#{cc}.html" )
|
170
|
-
|
171
|
-
# on error throw exception - why? why not??
|
172
|
-
if res.code != '200'
|
173
|
-
raise Fetcher::HttpError.new( res.code, res.message )
|
174
|
-
end
|
175
|
-
|
176
|
-
###
|
177
|
-
# Note: Net::HTTP will NOT set encoding UTF-8 etc.
|
178
|
-
# will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
179
|
-
html = res.body.to_s
|
180
|
-
end
|
181
|
-
end # PageFetcher
|
182
|
-
=end
|
183
|
-
|
184
|
-
|
185
148
|
end # module Factbook
|