factbook 1.2.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -1
- data/README.md +548 -543
- data/Rakefile +34 -33
- data/data/codes.csv +262 -262
- data/data/codesxref.csv +280 -280
- data/lib/factbook.rb +68 -75
- data/lib/factbook/builder.rb +14 -3
- data/lib/factbook/builder_item.rb +93 -59
- data/lib/factbook/page.rb +20 -57
- data/lib/factbook/sanitizer.rb +98 -285
- data/lib/factbook/version.rb +21 -22
- data/script/json.rb +3 -2
- data/test/data/src/au.html +658 -658
- data/test/data/src/be.html +648 -648
- data/test/helper.rb +11 -11
- data/test/test_fields.rb +52 -52
- data/test/test_json.rb +45 -45
- data/test/test_page.rb +38 -38
- metadata +31 -11
data/lib/factbook.rb
CHANGED
@@ -1,75 +1,68 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
require '
|
6
|
-
require '
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
require '
|
11
|
-
|
12
|
-
require '
|
13
|
-
require '
|
14
|
-
|
15
|
-
|
16
|
-
##
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
require '
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
require 'factbook/
|
51
|
-
require 'factbook/
|
52
|
-
require 'factbook/
|
53
|
-
require 'factbook/
|
54
|
-
|
55
|
-
require 'factbook/
|
56
|
-
require 'factbook/
|
57
|
-
|
58
|
-
require 'factbook/
|
59
|
-
|
60
|
-
require 'factbook/
|
61
|
-
|
62
|
-
require 'factbook/
|
63
|
-
require 'factbook/
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
require 'factbook/db/schema' ## database (sql tables) support
|
70
|
-
require 'factbook/db/models'
|
71
|
-
require 'factbook/db/importer'
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
puts Factbook.banner if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG
|
1
|
+
## stdlibs
|
2
|
+
|
3
|
+
|
4
|
+
require 'cgi'
|
5
|
+
require 'csv' ## fix: use csvreader!!!!
|
6
|
+
require 'erb' ## used by Almanac class (for render)
|
7
|
+
|
8
|
+
|
9
|
+
## 3rd party gems/libs
|
10
|
+
## require 'props'
|
11
|
+
|
12
|
+
require 'logutils'
|
13
|
+
require 'webget'
|
14
|
+
require 'nokogiri'
|
15
|
+
|
16
|
+
require 'active_record' ## add activerecord/db support (NOT optional for now)
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
# our own code
|
21
|
+
require 'factbook/version' # let it always go first
|
22
|
+
|
23
|
+
|
24
|
+
require 'factbook/codes'
|
25
|
+
require 'factbook/comparisons'
|
26
|
+
require 'factbook/attributes'
|
27
|
+
|
28
|
+
module Factbook
|
29
|
+
|
30
|
+
## auto-load builtin codes, comparisons, attributes, etc.
|
31
|
+
CODES = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
|
32
|
+
COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv" )
|
33
|
+
ATTRIBUTES = Attributes.from_yaml( "#{Factbook.root}/data/attributes.yml" )
|
34
|
+
|
35
|
+
def self.codes() CODES; end
|
36
|
+
def self.comparisons() COMPARISONS; end
|
37
|
+
def self.attributes() ATTRIBUTES; end
|
38
|
+
|
39
|
+
end # module Factbook
|
40
|
+
|
41
|
+
## note: make codes, comparisons, attributes available
|
42
|
+
|
43
|
+
require 'factbook/utils'
|
44
|
+
require 'factbook/utils_info'
|
45
|
+
require 'factbook/sanitizer'
|
46
|
+
require 'factbook/normalize'
|
47
|
+
require 'factbook/builder_item'
|
48
|
+
require 'factbook/builder'
|
49
|
+
require 'factbook/builder_json'
|
50
|
+
require 'factbook/page'
|
51
|
+
require 'factbook/page_info'
|
52
|
+
require 'factbook/sect'
|
53
|
+
require 'factbook/subsect'
|
54
|
+
|
55
|
+
require 'factbook/reader_json'
|
56
|
+
require 'factbook/almanac'
|
57
|
+
|
58
|
+
require 'factbook/table' ## e.g. TableReader
|
59
|
+
|
60
|
+
require 'factbook/counter'
|
61
|
+
|
62
|
+
require 'factbook/db/schema' ## database (sql tables) support
|
63
|
+
require 'factbook/db/models'
|
64
|
+
require 'factbook/db/importer'
|
65
|
+
|
66
|
+
|
67
|
+
|
68
|
+
puts Factbook.banner
|
data/lib/factbook/builder.rb
CHANGED
@@ -40,12 +40,23 @@ def initialize( html_ascii )
|
|
40
40
|
## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
|
41
41
|
@html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
|
42
42
|
|
43
|
-
@html_debug = map_sects( @html )
|
44
|
-
@html_debug = map_subsects( @html_debug )
|
45
43
|
|
46
|
-
html_sects =
|
44
|
+
html_sects = if @html.empty?
|
45
|
+
## note: support "empty" pages - old format waiting for update!!!
|
46
|
+
## cannot parse for now
|
47
|
+
[] ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
|
48
|
+
else
|
49
|
+
@html_debug = map_sects( @html )
|
50
|
+
@html_debug = map_subsects( @html_debug )
|
51
|
+
|
52
|
+
split_sects( @html_debug )
|
53
|
+
end
|
54
|
+
|
47
55
|
pp html_sects
|
48
56
|
|
57
|
+
## debug
|
58
|
+
## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
|
59
|
+
|
49
60
|
|
50
61
|
@sects = []
|
51
62
|
html_sects.each do |html_sect|
|
@@ -5,88 +5,122 @@ module Factbook
|
|
5
5
|
class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
|
6
6
|
include LogUtils::Logging
|
7
7
|
include NormalizeHelper ## e.g. normalize_category
|
8
|
-
|
8
|
+
|
9
9
|
def initialize( html, name )
|
10
10
|
@html = html
|
11
11
|
@name = name # add category/field name e.g. Area, Location, etc.
|
12
12
|
end
|
13
|
-
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
##
|
17
|
+
## <div class="category_data subfield text">
|
18
|
+
## Portuguese (official and most widely spoken language)
|
19
|
+
##
|
20
|
+
## </div>
|
21
|
+
## <div class="category_data note">
|
22
|
+
## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
|
23
|
+
## </div>
|
24
|
+
|
25
|
+
|
14
26
|
def read
|
15
27
|
## return hash from html snippet
|
16
28
|
doc = Nokogiri::HTML.fragment( @html )
|
17
29
|
|
18
30
|
data = {}
|
19
|
-
last_node = nil ## track last hash (always use text key)
|
20
|
-
last_node_data_count = 0
|
21
31
|
|
22
32
|
## note:
|
23
33
|
## skip whitespace text nodes (e.g. \n\n etc); just use divs
|
24
|
-
doc.children.filter('div')
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
else ### possible ??? if data_count is zero - not should not include any data
|
42
|
-
## todo: issue warning here - why? why not??
|
43
|
-
last_node['text'] += " #{text}" ## append w/o separator
|
44
|
-
end
|
45
|
-
else
|
46
|
-
if @name == 'Demographic profile' ## special case (use space a sep)
|
47
|
-
last_node['text'] += " #{text}" ## append without (w/o) separator
|
34
|
+
doc_children = doc.children.filter('div')
|
35
|
+
|
36
|
+
puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
|
37
|
+
|
38
|
+
doc_children.each_with_index do |div,i|
|
39
|
+
if div['class'].index( 'note' )
|
40
|
+
text = squish( div.text.strip )
|
41
|
+
puts "category_data: >#{text}<"
|
42
|
+
|
43
|
+
data['note'] = { 'text' => text }
|
44
|
+
elsif div['class'].index( 'historic' )
|
45
|
+
## add all historic together into one for now
|
46
|
+
text = squish( div.text.strip )
|
47
|
+
puts "category_data: >#{text}<"
|
48
|
+
|
49
|
+
if i == 0
|
50
|
+
data['text'] = text
|
48
51
|
else
|
49
|
-
|
52
|
+
## append with / for now
|
53
|
+
data['text'] += " / #{text}"
|
50
54
|
end
|
51
|
-
|
52
|
-
|
55
|
+
elsif div.css( 'span.subfield-name').empty?
|
56
|
+
## assume "implied text field"
|
57
|
+
## check for index == 1 / child count == 1 - why? why not
|
58
|
+
text = squish( div.text.strip ) ## fix/todo: use strip
|
59
|
+
puts "category_data: >#{text}<"
|
53
60
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
if
|
58
|
-
puts "
|
61
|
+
data['text'] = text
|
62
|
+
|
63
|
+
## must be always first node for now
|
64
|
+
if i != 0
|
65
|
+
puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
|
66
|
+
puts @html
|
67
|
+
exit 1
|
59
68
|
end
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
69
|
+
elsif div['class'].index( 'grouped_subfield' )
|
70
|
+
## split grouped subfield!!
|
71
|
+
## <span class="subfield-name">arable land:</span>
|
72
|
+
## <span class="subfield-number">8.6%</span>
|
73
|
+
## <span class="subfield-date">(2011 est.)</span>
|
74
|
+
## /
|
75
|
+
## <span class="subfield-name">permanent crops:</span>
|
76
|
+
## <span class="subfield-number">0.8%</span>
|
77
|
+
## <span class="subfield-date">(2011 est.)</span>
|
78
|
+
## /
|
79
|
+
## <span class="subfield-name">permanent pasture:</span>
|
80
|
+
## <span class="subfield-number">23.5%</span>
|
81
|
+
## <span class="subfield-date">(2011 est.)</span>
|
82
|
+
|
83
|
+
## join names for now - why? why not?
|
84
|
+
## e.g. becomes:
|
85
|
+
## arable land / permanent crops / permanent pasture: for key ??
|
86
|
+
span_names = div.css( 'span.subfield-name')
|
87
|
+
keys = []
|
88
|
+
span_names.each do |span|
|
89
|
+
keys << normalize_category( span.text.strip )
|
90
|
+
span.replace( '' )
|
91
|
+
end
|
92
|
+
key = keys.join( ' / ')
|
93
|
+
text = squish( div.text.strip )
|
94
|
+
puts "category_data key >#{key}<: >#{text}<"
|
95
|
+
data[ key ] = { 'text' => text }
|
76
96
|
else
|
77
|
-
|
97
|
+
## get subfield name
|
98
|
+
span_names = div.css( 'span.subfield-name')
|
99
|
+
if span_names.size > 1
|
100
|
+
puts "!! ERROR - found more than one subfield-name:"
|
101
|
+
puts div.to_html
|
102
|
+
exit 1
|
103
|
+
end
|
104
|
+
key = normalize_category( span_names[0].text.strip )
|
105
|
+
span_names[0].replace( '' )
|
106
|
+
|
107
|
+
text = squish( div.text.strip )
|
108
|
+
puts "category_data key >#{key}<: >#{text}<"
|
109
|
+
data[ key ] = { 'text' => text }
|
78
110
|
end
|
79
|
-
|
80
|
-
## pp child
|
81
|
-
## css = child['class']
|
82
|
-
## puts "[#{i}] #{child.name} class='>#{css}< : #{css.class.name}' >#{child.text}<"
|
83
111
|
end
|
84
|
-
|
112
|
+
|
113
|
+
|
85
114
|
pp data
|
86
115
|
data
|
87
116
|
end
|
88
117
|
|
89
|
-
|
118
|
+
|
119
|
+
|
120
|
+
def squish( str )
|
121
|
+
str.gsub( /[ \t\n\r]{2,}/, ' ') ## replace multi-spaces (incl. newlines with once space)
|
122
|
+
end
|
123
|
+
|
90
124
|
end # class ItemBuilder
|
91
125
|
|
92
126
|
end # module Factbook
|
data/lib/factbook/page.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
@@ -38,10 +37,10 @@ class Page
|
|
38
37
|
|
39
38
|
def initialize( code, opts={} )
|
40
39
|
### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
|
41
|
-
|
40
|
+
|
42
41
|
if opts[:json]
|
43
42
|
json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
|
44
|
-
b = JsonBuilder.from_string( json )
|
43
|
+
b = JsonBuilder.from_string( json )
|
45
44
|
else ## assume html
|
46
45
|
if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
|
47
46
|
## for debugging and testing allow "custom" passed-in html page
|
@@ -49,11 +48,13 @@ class Page
|
|
49
48
|
else
|
50
49
|
url_string = SITE_BASE.gsub( '{code}', code )
|
51
50
|
## note: expects ASCII-7BIT/BINARY encoding
|
52
|
-
|
53
|
-
|
51
|
+
|
52
|
+
## html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
|
53
|
+
html = Webcache.read( url_string )
|
54
|
+
end
|
54
55
|
b = Builder.from_string( html )
|
55
56
|
end
|
56
|
-
|
57
|
+
|
57
58
|
@sects = b.sects
|
58
59
|
@info = b.info
|
59
60
|
|
@@ -65,7 +66,7 @@ class Page
|
|
65
66
|
@info = info
|
66
67
|
end
|
67
68
|
|
68
|
-
@data = {}
|
69
|
+
@data = {}
|
69
70
|
@sects.each do |sect|
|
70
71
|
@data[ sect.title ] = sect.data
|
71
72
|
end
|
@@ -78,7 +79,7 @@ class Page
|
|
78
79
|
if opts[:minify]
|
79
80
|
data.to_json
|
80
81
|
else
|
81
|
-
## was: -- opts[:pretty] || opts[:pp]
|
82
|
+
## was: -- opts[:pretty] || opts[:pp]
|
82
83
|
JSON.pretty_generate( data ) ## note: pretty print by default!
|
83
84
|
end
|
84
85
|
end
|
@@ -96,10 +97,10 @@ class Page
|
|
96
97
|
end
|
97
98
|
|
98
99
|
## add convenience (shortcut) accessors / attributes / fields / getters
|
99
|
-
|
100
|
+
|
100
101
|
ATTRIBUTES.each do |attrib|
|
101
102
|
## e.g.
|
102
|
-
## def background() data['Introduction']['Background']['text']; end
|
103
|
+
## def background() data['Introduction']['Background']['text']; end
|
103
104
|
## def location() data['Geography']['Location']['text']; end
|
104
105
|
## etc.
|
105
106
|
if attrib.path.size == 1
|
@@ -114,31 +115,18 @@ class Page
|
|
114
115
|
fetch( attrib.path[1], {} )['text']
|
115
116
|
end
|
116
117
|
end
|
117
|
-
end
|
118
|
+
end
|
118
119
|
|
119
120
|
|
120
121
|
private
|
121
|
-
def fetch_page(
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
# NB: Net::HTTP will NOT set encoding UTF-8 etc.
|
130
|
-
# will mostly be ASCII
|
131
|
-
# - try to change encoding to UTF-8 ourselves
|
132
|
-
logger.debug "t.encoding.name (before): #{t.encoding.name}"
|
133
|
-
#####
|
134
|
-
# NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
135
|
-
t
|
136
|
-
else
|
137
|
-
logger.error "fetch HTTP - #{response.code} #{response.message}"
|
138
|
-
## todo/fix: raise http exception (see fetcher) -- why? why not??
|
139
|
-
fail "fetch HTTP - #{response.code} #{response.message}"
|
140
|
-
nil
|
141
|
-
end
|
122
|
+
def fetch_page( url )
|
123
|
+
response = Webget.page( url )
|
124
|
+
|
125
|
+
## note: exit on get / fetch error - do NOT continue for now - why? why not?
|
126
|
+
exit 1 if response.status.nok? ## e.g. HTTP status code != 200
|
127
|
+
|
128
|
+
|
129
|
+
response.text
|
142
130
|
end
|
143
131
|
|
144
132
|
|
@@ -157,29 +145,4 @@ end
|
|
157
145
|
|
158
146
|
|
159
147
|
end # class Page
|
160
|
-
|
161
|
-
|
162
|
-
=begin
|
163
|
-
class PageFetcher
|
164
|
-
|
165
|
-
def fetch( cc )
|
166
|
-
worker = Fetcher::Worker.new
|
167
|
-
factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
|
168
|
-
|
169
|
-
res = worker.get_response( "#{factbook_base}/#{cc}.html" )
|
170
|
-
|
171
|
-
# on error throw exception - why? why not??
|
172
|
-
if res.code != '200'
|
173
|
-
raise Fetcher::HttpError.new( res.code, res.message )
|
174
|
-
end
|
175
|
-
|
176
|
-
###
|
177
|
-
# Note: Net::HTTP will NOT set encoding UTF-8 etc.
|
178
|
-
# will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
179
|
-
html = res.body.to_s
|
180
|
-
end
|
181
|
-
end # PageFetcher
|
182
|
-
=end
|
183
|
-
|
184
|
-
|
185
148
|
end # module Factbook
|