factbook 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +18 -0
- data/README.md +7 -0
- data/data/attributes.yml +337 -0
- data/data/categories.csv +1 -1
- data/lib/factbook.rb +29 -14
- data/lib/factbook/almanac.rb +72 -0
- data/lib/factbook/attributes.rb +74 -0
- data/lib/factbook/builder.rb +2 -2
- data/lib/factbook/builder_item.rb +7 -8
- data/lib/factbook/builder_json.rb +79 -0
- data/lib/factbook/counter.rb +48 -0
- data/lib/factbook/normalize.rb +43 -0
- data/lib/factbook/page.rb +37 -45
- data/lib/factbook/page_info.rb +12 -0
- data/lib/factbook/reader_json.rb +51 -0
- data/lib/factbook/sanitizer.rb +0 -7
- data/lib/factbook/version.rb +1 -1
- data/script/almanac.rb +48 -0
- data/script/attributes.rb +34 -0
- data/script/build.rb +28 -0
- data/script/counter.rb +145 -0
- data/script/json.rb +18 -0
- data/test/data/json/au.json +892 -0
- data/test/test_attribs.rb +33 -2
- data/test/test_attribs_def.rb +20 -0
- data/test/test_counter.rb +31 -0
- data/test/test_json_builder.rb +25 -0
- data/test/test_normalize.rb +23 -0
- metadata +20 -2
@@ -0,0 +1,72 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
class Almanac

  ## convenience helper ("factory")
  ##   reads the pages for the passed-in codes using JsonPageReader (from json_dir)
  ##   and wraps them in a new almanac
  def self.from_json( codes, json_dir: '.' )
    pages = JsonPageReader.new( json_dir ).read_pages( codes )
    new( pages )
  end


  ## pages - collection of page objects; every page needs to respond to
  ##          whatever the template uses (the view helpers in PageCtx call
  ##          name, name_long and name_local)
  def initialize( pages )
    @pages = pages
  end

  ## renders every page using the passed-in (erb) template text
  ##   returns all rendered pages concatenated into a single string
  def render( template )
    buf = ''.dup    ## note: use an unfrozen copy; keeps << working even if string literals get frozen
    @pages.each do |page|
      text = PageCtx.new( page, template ).render

      puts text   ## for debugging write country profile to console (too)
      buf << text
    end
    puts "count: #{@pages.count}"
    buf    ## return buffered almanac text
  end


  ## template (binding) context for a single page;
  ##   all methods are callable from inside the erb template
  class PageCtx
    attr_accessor :page

    def initialize( page, template )
      @page     = page
      @template = template
    end

    ##############################
    ## add some "view helpers"

    ## calculate name (use long name if (short) name is not available e.g. none)
    ##    e.g. Austria
    def name
      if @name.nil?
        @name = page.name
        @name = page.name_long   if @name == 'none'
      end
      @name
    end

    ## name plus local (in its own non-english language) name if there is one
    ##    e.g. Austria • Österreich
    def names( separator: ' • ' )
      if @names.nil?
        local = page.name_local
        if missing?( local ) || local == 'none' || local == name
          @names = [name]          ## no local name
        else
          @names = [name, local]
        end
      end
      @names.join( separator )
    end

    ## evaluates the template with this object as binding (context)
    def render
      ERB.new( @template ).result( binding )
    end

  private
    ## true for nil, empty or whitespace-only values
    ##   note: plain-ruby check used in place of blank? (blank? is not part of core ruby)
    def missing?( value )
      return true   if value.nil?
      value = value.strip   if value.respond_to?( :strip )
      value.respond_to?( :empty? ) && value.empty?
    end
  end  ## PageCtx

end  ## Almanac
|
71
|
+
|
72
|
+
end # module Factbook
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module Factbook
|
5
|
+
|
6
|
+
class Attributes
  include Enumerable    ## note: built on each; adds map, select, etc.

  Attribute = Struct.new( :name,
                          :category,   ## e.g. Introduction, Geography, etc.
                          :path        ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
                        )

  ## special cases: for these entries the 'name' key is a regular sub-entry
  ##   (NOT the attribute definition marker)
  NAME_IS_ENTRY = ['Capital', 'National anthem'].freeze


  ## loads the attribute definitions from a yaml file
  ##   top-level keys are used as categories; everything below gets walked by build_attribs
  def self.from_yaml( path )
    h = YAML.load_file( path )
    pp h    ## for debugging dump loaded definitions

    attribs = []
    h.each do |category, defs|
      build_attribs( attribs, category, [], defs )
    end

    new( attribs )
  end


  ## walks the definition hash h (recursive) and appends an Attribute to attribs
  ##   for every hash with a 'name' key
  ##
  ##   attribs  - array collecting the attributes (gets changed)
  ##   category - category recorded in every attribute found
  ##   path     - keys walked so far (below the category)
  ##   h        - definition hash for path; nil (empty yaml node) gets skipped
  ##
  ##   note: h is only read (never changed); callers do NOT need to pass in a copy
  def self.build_attribs( attribs, category, path, h )
    return if h.nil?   ## empty node; nothing to do

    ## assume it's an attribute definition hash if it has a name
    ##   (excluding the special cases e.g. Capital, National anthem)
    is_attrib = h.has_key?( 'name' ) && !NAME_IS_ENTRY.include?( path.last )

    if is_attrib
      a = Attribute.new( h['name'], category, path )

      puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
      attribs << a
    end

    ## continue walking (recursive)
    h.each do |k,v|
      next if is_attrib && k == 'name'     ## name already used for the attribute itself
      build_attribs( attribs, category, path + [k], v )   ## note: path + [k] creates a new array (copy)
    end
  end


  def initialize( attribs )
    @attribs = attribs
  end

  def to_a()  @attribs; end
  def size()  @attribs.size; end

  ## yields every attribute; returns an enumerator if called without a block
  def each( &blk )
    return enum_for( :each )  unless blk
    @attribs.each( &blk )
  end

end # class Attributes
|
72
|
+
|
73
|
+
end # module Factbook
|
74
|
+
|
data/lib/factbook/builder.rb
CHANGED
@@ -29,7 +29,7 @@ end
|
|
29
29
|
attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
|
30
30
|
:html, ## utf-8 encoded profile
|
31
31
|
:html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
|
32
|
-
:
|
32
|
+
:info, ## page info incl. country_name, region_name, last_updated etc.
|
33
33
|
:errors, ## encoding erros etc.
|
34
34
|
:sects
|
35
35
|
|
@@ -38,7 +38,7 @@ def initialize( html_ascii )
|
|
38
38
|
@html_ascii = html_ascii
|
39
39
|
|
40
40
|
## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
|
41
|
-
@html, @
|
41
|
+
@html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
|
42
42
|
|
43
43
|
@html_debug = map_sects( @html )
|
44
44
|
@html_debug = map_subsects( @html_debug )
|
@@ -4,6 +4,7 @@ module Factbook
|
|
4
4
|
|
5
5
|
class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
|
6
6
|
include LogUtils::Logging
|
7
|
+
include NormalizeHelper ## e.g. normalize_category
|
7
8
|
|
8
9
|
def initialize( html, name )
|
9
10
|
@html = html
|
@@ -42,7 +43,7 @@ def read
|
|
42
43
|
last_node['text'] += " #{text}" ## append w/o separator
|
43
44
|
end
|
44
45
|
else
|
45
|
-
if @name == '
|
46
|
+
if @name == 'Demographic profile' ## special case (use space a sep)
|
46
47
|
last_node['text'] += " #{text}" ## append without (w/o) separator
|
47
48
|
else
|
48
49
|
last_node['text'] += " ++ #{text}" ## append with ++ separator
|
@@ -60,14 +61,11 @@ def read
|
|
60
61
|
## pp spans
|
61
62
|
|
62
63
|
span_key = spans[0] ## assume 1st entry is span.category
|
63
|
-
span_value = spans[1] ## assume 2nd entry is span.category_data
|
64
|
-
|
65
|
-
key = span_key.text
|
66
|
-
|
67
|
-
key = key.strip
|
68
|
-
key = key.sub( /:\z/, '' ) # remove trailing : if present
|
69
|
-
key = key.strip
|
64
|
+
span_value = spans[1] ## assume 2nd entry is span.category_data
|
65
|
+
|
66
|
+
key = normalize_category( span_key.text )
|
70
67
|
|
68
|
+
## note: allow optional category_data for now
|
71
69
|
value = span_value ? span_value.text : nil
|
72
70
|
|
73
71
|
puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
|
@@ -87,6 +85,7 @@ def read
|
|
87
85
|
pp data
|
88
86
|
data
|
89
87
|
end
|
88
|
+
|
90
89
|
|
91
90
|
end # class ItemBuilder
|
92
91
|
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
######
|
6
|
+
# json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
|
7
|
+
|
8
|
+
class JsonBuilder
  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category


  ## reads the json text from path and builds from it
  ##   note: always read as utf-8 (and skip a leading byte order mark if present);
  ##         do NOT depend on the platform default external encoding
  def self.from_file( path )
    text = File.open( path, 'r:BOM|UTF-8', &:read )
    from_string( text )
  end

  ## text - json document (still a string, NOT yet parsed)
  def self.from_string( text )
    new( text )
  end


  attr_reader :text,
              :json,
              :info,       ## not used yet -- page info incl. country_name, region_name, last_updated etc.
              :errors,     ## not used yet -- encoding errors etc.
              :sects


  ## parses the json text and rebuilds the sections:
  ##   first level keys become sections (Sect), second level keys subsections (Subsect);
  ##   the value below a subsection title is kept as the subsection data
  def initialize( text )
    @text   = text
    @json   = JSON.parse( text )

    @info   = nil    ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
    @errors = []     ## fix/todo: sorry - for now no errors possible/tracked

    @sects = @json.map do |sect_title, sect_subsects|
      sect = Sect.new
      sect.title = sect_title

      ## get subsections
      sect.subsects = sect_subsects.map do |subsect_title, subsect_data|
        subsect = Subsect.new
        subsect.title = subsect_title
        subsect.data  = normalize_keys( subsect_data )
        subsect
      end

      sect
    end
  end

private
  ## runs the keys of a data hash through normalize_category (again);
  ##   returns a new hash; anything that is not a hash gets passed through as is
  def normalize_keys( data )
    return data   unless data.is_a?( Hash )

    data.each_with_object( {} ) do |(k,v), normalized|
      normalized[ normalize_category( k ) ] = v
    end
  end

end # class JsonBuilder
|
77
|
+
|
78
|
+
|
79
|
+
end # module Factbook
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
class Counter

  ## nested stats hash; mirrors the (hash) structure of the counted pages
  ##   every node holds a :count plus - up to the threshold - a :codes string
  ##   (country codes separated by space) next to the (string) keys of its child nodes
  attr_reader :data

  ## max_codes - threshold; once a node got counted more often
  ##              its list of country codes gets dropped (default: 9)
  def initialize( max_codes: 9 )
    @data      = {}
    @max_codes = max_codes
  end

  ## walk page data hash and add nodes to data
  def count( page )
    walk( page, page.data, @data )
  end


private
  ## adds every hash value of hin to the stats in hout (recursive)
  ##   note: values that are not a hash are not counted
  def walk( page, hin, hout )
    hin.each do |k,v|
      next unless v.is_a?( Hash )

      stats = ( hout[k] ||= { count: 0, codes: ''.dup } )   ## note: ''.dup => always a changeable string
      stats[ :count ] += 1

      ## delete codes if larger (threshold) than x (e.g. 9)
      stats.delete( :codes )   if stats[ :count ] > @max_codes

      codes = stats[ :codes ]
      if codes    ## note: might got deleted if passed threshold (e.g. 9 entries)
        codes << ' '   unless codes.empty?    ## add separator (space for now)
        codes << page.info.country_code
      end

      walk( page, v, stats )
    end
  end

end # class Counter
|
47
|
+
|
48
|
+
end # module Factbook
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
module NormalizeHelper

  ## cleans up a category (key) text:
  ##   strips surrounding whitespace and trailing colons,
  ##   then maps known typos, case variants and spelling variants to one canonical form
  def normalize_category( text )

    ## note: use :+ to fix typos/errors with double colons too e.g. note:: (instead of note:)
    key = text.strip.sub( /:+\z/, '' ).strip

    case key
    ## typos e.g ntoe => use note; spelling variant notes => use note
    when 'ntoe', 'notes'                  then 'note'
    when 'investment if fixed capital'    then 'investment in fixed capital'
    ## downcase
    when 'Lowest point'                   then 'lowest point'
    when 'Chief of state'                 then 'chief of state'
    ## spelling variant (use more popular one)
    when 'signed but not ratified'        then 'signed, but not ratified'
    when 'vectorborne disease'            then 'vectorborne diseases'
    when 'water contact diseases'         then 'water contact disease'
    when 'food or waterborne disease'     then 'food or waterborne diseases'
    when 'geographical coordinates'       then 'geographic coordinates'
    when 'refugees (countries of origin)' then 'refugees (country of origin)'
    else
      ## border countries (8): -- remove (x) counter
      if key.start_with?( 'border countries' )
        'border countries'
      else
        key
      end
    end
  end

end # module NormalizeHelper
|
43
|
+
end # module Factbook
|
data/lib/factbook/page.rb
CHANGED
@@ -39,18 +39,31 @@ class Page
|
|
39
39
|
def initialize( code, opts={} )
|
40
40
|
### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
|
41
41
|
|
42
|
-
if opts[:
|
43
|
-
|
44
|
-
|
45
|
-
else
|
46
|
-
|
47
|
-
|
48
|
-
|
42
|
+
if opts[:json]
|
43
|
+
json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
|
44
|
+
b = JsonBuilder.from_string( json )
|
45
|
+
else ## assume html
|
46
|
+
if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
|
47
|
+
## for debugging and testing allow "custom" passed-in html page
|
48
|
+
html = opts[:html]
|
49
|
+
else
|
50
|
+
url_string = SITE_BASE.gsub( '{code}', code )
|
51
|
+
## note: expects ASCII-7BIT/BINARY encoding
|
52
|
+
html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
|
53
|
+
end
|
54
|
+
b = Builder.from_string( html )
|
49
55
|
end
|
50
56
|
|
51
|
-
b = Builder.from_string( html )
|
52
57
|
@sects = b.sects
|
53
|
-
@info = b.
|
58
|
+
@info = b.info
|
59
|
+
|
60
|
+
## todo/fix/quick hack:
|
61
|
+
## check for info opts hash entry - lets you overwrite page info
|
62
|
+
## -- use proper header to setup page info - why, why not??
|
63
|
+
if opts[:info]
|
64
|
+
info = opts[:info]
|
65
|
+
@info = info
|
66
|
+
end
|
54
67
|
|
55
68
|
@data = {}
|
56
69
|
@sects.each do |sect|
|
@@ -83,43 +96,22 @@ class Page
|
|
83
96
|
end
|
84
97
|
|
85
98
|
## add convenience (shortcut) accessors / attributes / fields / getters
|
86
|
-
|
87
|
-
ATTRIBUTES = {
|
88
|
-
'Introduction' => [[:background, 'Background' ]],
|
89
|
-
'Geography' => [[:area, 'Area', 'total'], ## convert to number -- why? why not??
|
90
|
-
[:area_land, 'Area', 'land' ],
|
91
|
-
[:area_water, 'Area', 'water'],
|
92
|
-
[:area_note, 'Area', 'note' ],
|
93
|
-
[:area_comparative, 'Area - comparative'],
|
94
|
-
[:climate, 'Climate'],
|
95
|
-
[:terrain, 'Terrain'],
|
96
|
-
[:elevation_lowest, 'Elevation extremes', 'lowest point'],
|
97
|
-
[:elevation_highest,'Elevation extremes', 'highest point'],
|
98
|
-
[:resources, 'Natural resources']],
|
99
|
-
'People and Society' => [[:languages, 'Languages' ],
|
100
|
-
[:religions, 'Religions' ],
|
101
|
-
[:population, 'Population' ],
|
102
|
-
[:population_growth, 'Population growth rate' ],
|
103
|
-
[:birth_rate, 'Birth rate' ],
|
104
|
-
[:death_rate, 'Death rate' ],
|
105
|
-
[:migration_rate, 'Net migration rate' ],
|
106
|
-
[:major_cities, 'Major urban areas - population' ]],
|
107
|
-
}
|
108
99
|
|
109
|
-
ATTRIBUTES.each do |
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
100
|
+
ATTRIBUTES.each do |attrib|
|
101
|
+
## e.g.
|
102
|
+
## def background() data['Introduction']['Background']['text']; end
|
103
|
+
## def location() data['Geography']['Location']['text']; end
|
104
|
+
## etc.
|
105
|
+
if attrib.path.size == 1
|
106
|
+
define_method attrib.name.to_sym do
|
107
|
+
@data.fetch( attrib.category, {} ).
|
108
|
+
fetch( attrib.path[0], {} )['text']
|
109
|
+
end
|
110
|
+
else ## assume size 2 for now
|
111
|
+
define_method attrib.name.to_sym do
|
112
|
+
@data.fetch( attrib.category, {} ).
|
113
|
+
fetch( attrib.path[0], {} ).
|
114
|
+
fetch( attrib.path[1], {} )['text']
|
123
115
|
end
|
124
116
|
end
|
125
117
|
end
|