factbook 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +18 -0
- data/README.md +7 -0
- data/data/attributes.yml +337 -0
- data/data/categories.csv +1 -1
- data/lib/factbook.rb +29 -14
- data/lib/factbook/almanac.rb +72 -0
- data/lib/factbook/attributes.rb +74 -0
- data/lib/factbook/builder.rb +2 -2
- data/lib/factbook/builder_item.rb +7 -8
- data/lib/factbook/builder_json.rb +79 -0
- data/lib/factbook/counter.rb +48 -0
- data/lib/factbook/normalize.rb +43 -0
- data/lib/factbook/page.rb +37 -45
- data/lib/factbook/page_info.rb +12 -0
- data/lib/factbook/reader_json.rb +51 -0
- data/lib/factbook/sanitizer.rb +0 -7
- data/lib/factbook/version.rb +1 -1
- data/script/almanac.rb +48 -0
- data/script/attributes.rb +34 -0
- data/script/build.rb +28 -0
- data/script/counter.rb +145 -0
- data/script/json.rb +18 -0
- data/test/data/json/au.json +892 -0
- data/test/test_attribs.rb +33 -2
- data/test/test_attribs_def.rb +20 -0
- data/test/test_counter.rb +31 -0
- data/test/test_json_builder.rb +25 -0
- data/test/test_normalize.rb +23 -0
- metadata +20 -2
@@ -0,0 +1,72 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
|
6
|
+
class Almanac
  ## Renders a list of country pages into one text document ("almanac")
  ## by running every page through the same ERB template.

  ## convenience helper ("factory")
  ##   reads the pages for the passed-in codes via JsonPageReader
  ##   (looking in json_dir) and wraps them in a new almanac
  def self.from_json( codes, json_dir: '.' )
    pages = JsonPageReader.new( json_dir ).read_pages( codes )
    self.new( pages )
  end

  ## pages - list of page objects; every page needs to respond to
  ##         name, name_long and name_local (see PageCtx below)
  def initialize( pages )
    @pages = pages
  end

  ## Renders every page with the passed-in ERB template (source text).
  ##
  ## Returns the concatenated text of all rendered pages.
  ## note: every rendered page and the page count get written to the console (too).
  def render( template )
    buf = ''.dup   ## note: use an unfrozen copy - buffer gets appended to
    @pages.each do |page|
      text = PageCtx.new( page, template ).render

      puts text   ## for debugging write country profile to console (too)
      buf << text
    end
    puts "count: #{@pages.count}"
    buf    ## return buffered almanac text
  end


  ## Template context for a single page; the ERB template gets evaluated
  ## with this object's binding, that is, the template may call
  ## page, name, names, etc.
  class PageCtx
    attr_accessor :page

    def initialize(page, template)
      @page     = page
      @template = template
    end

    ##############################
    ## add some "view helpers"

    ## Display name e.g. Austria
    ##   note: use the long name if the (short) name is not available,
    ##     that is, if it is missing (nil), blank or the literal 'none'
    def name
      @name ||= begin
        short = page.name
        if short.to_s.strip.empty? || short == 'none'
          page.name_long
        else
          short
        end
      end
    end

    ## Display name plus local name joined by separator
    ##   e.g. Austria • Österreich
    ## note: the local name gets dropped if it is blank, 'none'
    ##   or the same as the display name
    def names( separator: ' • ' )
      @names ||= begin
        local = page.name_local
        if local.blank? || local == 'none' || local == name
          [name]           ## no local (in its own non-english language) name
        else
          [name, local]
        end
      end
      @names.join( separator )
    end

    ## Evaluates the template in the context of this object; returns the text.
    def render
      ERB.new( @template ).result( binding )
    end
  end ## PageCtx

end ## Almanac
|
71
|
+
|
72
|
+
end # module Factbook
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module Factbook
|
5
|
+
|
6
|
+
class Attributes
  ## List of attribute definitions (name + category + path into the page data),
  ## built from a nested definition hash (e.g. loaded from a yaml file).
  include Enumerable    ## note: adds map, select, etc. on top of each

  Attribute = Struct.new( :name,
                          :category,    ## e.g. Introduction, Geography, etc.
                          :path         ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
                        )

  ## Loads the definitions from the yaml file at path; the top-level keys
  ## are used as categories. Returns a new Attributes list.
  def self.from_yaml( path )
    h = YAML.load_file( path )
    pp h    ## NOTE(review): debug dump of the complete definitions - remove or use a logger?

    attribs = []
    ## note: no copy of h needed - build_attribs does NOT change the passed-in hash
    h.each do |category, defs|
      build_attribs( attribs, category, [], defs )
    end

    self.new( attribs )
  end


  ## Walks the definition hash h (recursive) and appends an Attribute to
  ## attribs for every node with a 'name' entry.
  ##
  ##   attribs  - array collecting the attributes (gets appended to)
  ##   category - category recorded for every attribute found
  ##   path     - array of keys walked so far (never changed in place)
  ##   h        - definition hash for the current node; nil (empty entry) gets skipped
  ##
  ## note: h is NOT modified; the 'name' entry gets skipped using a filtered copy
  def self.build_attribs( attribs, category, path, h )
    return if h.nil?    ## empty entry; nothing to do

    children = h

    ## assume it's an attribute definition hash
    ##   note: !! exclude special cases:
    ##     Capital          -- incl. name key itself
    ##     National anthem
    if h.has_key?( 'name' ) && !['Capital','National anthem'].include?( path[-1] )
      a = Attribute.new( h['name'], category, path )

      puts "   adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
      attribs << a

      ## continue with all entries except the name (definition) itself
      children = h.reject { |k,_| k == 'name' }
    end

    ## continue walking (recursive)
    children.each do |k,v|
      new_path = path.dup << k   ## note: create a new array (copy)
      build_attribs( attribs, category, new_path, v )
    end
  end


  def initialize( attribs )
    @attribs = attribs
  end

  def to_a() @attribs; end
  def size() @attribs.size; end

  ## Yields every attribute; returns an enumerator if no block is given.
  def each
    return enum_for( :each ) unless block_given?
    @attribs.each { |attrib| yield( attrib ) }
  end

end # class Attributes
|
72
|
+
|
73
|
+
end # module Factbook
|
74
|
+
|
data/lib/factbook/builder.rb
CHANGED
@@ -29,7 +29,7 @@ end
|
|
29
29
|
attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
|
30
30
|
:html, ## utf-8 encoded profile
|
31
31
|
:html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
|
32
|
-
:
|
32
|
+
:info, ## page info incl. country_name, region_name, last_updated etc.
|
33
33
|
:errors, ## encoding erros etc.
|
34
34
|
:sects
|
35
35
|
|
@@ -38,7 +38,7 @@ def initialize( html_ascii )
|
|
38
38
|
@html_ascii = html_ascii
|
39
39
|
|
40
40
|
## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
|
41
|
-
@html, @
|
41
|
+
@html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
|
42
42
|
|
43
43
|
@html_debug = map_sects( @html )
|
44
44
|
@html_debug = map_subsects( @html_debug )
|
@@ -4,6 +4,7 @@ module Factbook
|
|
4
4
|
|
5
5
|
class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
|
6
6
|
include LogUtils::Logging
|
7
|
+
include NormalizeHelper ## e.g. normalize_category
|
7
8
|
|
8
9
|
def initialize( html, name )
|
9
10
|
@html = html
|
@@ -42,7 +43,7 @@ def read
|
|
42
43
|
last_node['text'] += " #{text}" ## append w/o separator
|
43
44
|
end
|
44
45
|
else
|
45
|
-
if @name == '
|
46
|
+
if @name == 'Demographic profile' ## special case (use space a sep)
|
46
47
|
last_node['text'] += " #{text}" ## append without (w/o) separator
|
47
48
|
else
|
48
49
|
last_node['text'] += " ++ #{text}" ## append with ++ separator
|
@@ -60,14 +61,11 @@ def read
|
|
60
61
|
## pp spans
|
61
62
|
|
62
63
|
span_key = spans[0] ## assume 1st entry is span.category
|
63
|
-
span_value = spans[1] ## assume 2nd entry is span.category_data
|
64
|
-
|
65
|
-
key = span_key.text
|
66
|
-
|
67
|
-
key = key.strip
|
68
|
-
key = key.sub( /:\z/, '' ) # remove trailing : if present
|
69
|
-
key = key.strip
|
64
|
+
span_value = spans[1] ## assume 2nd entry is span.category_data
|
65
|
+
|
66
|
+
key = normalize_category( span_key.text )
|
70
67
|
|
68
|
+
## note: allow optional category_data for now
|
71
69
|
value = span_value ? span_value.text : nil
|
72
70
|
|
73
71
|
puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
|
@@ -87,6 +85,7 @@ def read
|
|
87
85
|
pp data
|
88
86
|
data
|
89
87
|
end
|
88
|
+
|
90
89
|
|
91
90
|
end # class ItemBuilder
|
92
91
|
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
######
|
6
|
+
# json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
|
7
|
+
|
8
|
+
class JsonBuilder
  ## Rebuilds the sections (sects) of a page from "dumped" json text
  ## (instead of parsing the html page).
  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category


  ## Reads the json text from the file at path and builds from it.
  ##   note: always read as utf-8 (independent of the platform default encoding)
  ##         and skip a leading byte order mark (bom) if present
  def self.from_file( path )
    text = File.open( path, 'r:BOM|UTF-8' ) { |f| f.read }
    self.from_string( text )
  end

  def self.from_string( text )
    self.new( text )
  end


  attr_reader :text,
              :json,
              :info,       ## not used yet -- page info incl. country_name, region_name, last_updated etc.
              :errors,     ## not used yet -- encoding errors etc.
              :sects


  ## text - json document (string); expected to be a hash of
  ##        section title => { subsection title => data }
  def initialize( text )
    @text = text

    @json = JSON.parse( text )

    @info   = nil    ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
    @errors = []     ## fix/todo: sorry - for now no errors possible/tracked

    @sects = @json.map do |sect_title, sect_subsects|
      build_sect( sect_title, sect_subsects )
    end
  end


  private

  ## builds one section incl. all its subsections
  def build_sect( title, subsects_hash )
    sect = Sect.new
    sect.title = title
    sect.subsects = subsects_hash.map do |subsect_title, subsect_data|
      build_subsect( subsect_title, subsect_data )
    end
    sect
  end

  ## builds one subsection
  ##   note: run the keys of a data hash through normalize_category (again);
  ##         any other data (non-hash) is passed along as is
  def build_subsect( title, data )
    if data.is_a?( Hash )
      data = data.each_with_object( {} ) do |(key, value), normalized|
        normalized[ normalize_category( key ) ] = value
      end
    end

    subsect = Subsect.new
    subsect.title = title
    subsect.data  = data
    subsect
  end

end # class JsonBuilder
|
77
|
+
|
78
|
+
|
79
|
+
end # module Factbook
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
|
5
|
+
class Counter
  ## Counts how often every (hash) node shows up across the pages passed
  ## to count; the tallies are kept in a nested hash mirroring the page data.
  ##
  ## Every counted node holds
  ##   :count - number of pages with this node
  ##   :codes - space-separated country codes of the pages with this node
  ##            (dropped once count is larger than the threshold)
  ## plus one (string) entry per nested node.

  attr_reader :data

  ## codes_threshold - keep the list of country codes only while a node's
  ##                   count is not larger than this number (default: 9)
  def initialize( codes_threshold: 9 )
    @data            = {}
    @codes_threshold = codes_threshold
  end

  ## Adds the nodes of a page to the tallies.
  ##   page needs to respond to data (nested hash) and info.country_code
  def count( page )
    ## walk page data hash
    #   add nodes to data
    walk( page, page.data, @data )
  end


private
  ## hin  - (nested) page data hash to read
  ## hout - tally hash for the same level to update
  def walk( page, hin, hout )
    hin.each do |k,v|
      next unless v.is_a?( Hash )    ## note: only hash nodes get counted

      stats = (hout[k] ||= { count: 0, codes: '' })
      stats[ :count ] += 1

      ## delete codes if count is larger than the threshold (e.g. 9)
      stats.delete( :codes ) if stats[ :count ] > @codes_threshold

      codes = stats[ :codes ]
      if codes    ## note: might have been deleted if past the threshold
        codes << ' '  unless codes.empty?   ## add separator (space for now)
        codes << page.info.country_code
      end

      walk( page, v, stats )
    end
  end

end # class Counter
|
47
|
+
|
48
|
+
end # module Factbook
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Factbook
|
4
|
+
module NormalizeHelper

  #######################################
  ### special cases - category (as found) => category (to use)
  CATEGORY_FIXES = {
    ## typos e.g ntoe => use note
    'ntoe'                           => 'note',
    'investment if fixed capital'    => 'investment in fixed capital',

    ## downcase
    'Lowest point'                   => 'lowest point',
    'Chief of state'                 => 'chief of state',

    ## spelling variant (use more popular one)
    'signed but not ratified'        => 'signed, but not ratified',
    'vectorborne disease'            => 'vectorborne diseases',
    'water contact diseases'         => 'water contact disease',
    'food or waterborne disease'     => 'food or waterborne diseases',
    'geographical coordinates'       => 'geographic coordinates',
    'notes'                          => 'note',
    'refugees (countries of origin)' => 'refugees (country of origin)',
  }.freeze


  ## Cleans up a category (key) text:
  ##  - strips leading/trailing whitespace and trailing colons
  ##  - fixes known typos and spelling variants (see CATEGORY_FIXES)
  ##  - removes the counter from border countries e.g. border countries (8)
  ##
  ## Returns the normalized text (always a new string for fixed categories).
  def normalize_category( text )

    ## note: fix typos/errors with double colons e.g. note:: (instead of note:)
    ##   remove trailing : if present -- note: allow (fix) note:: too, thus, use :+
    text = text.strip.sub( /:+\z/, '' ).strip

    fixed = CATEGORY_FIXES[ text ]
    text  = fixed.dup   if fixed   ## note: hand out a copy (keep lookup table as is)

    ## border countries (8): -- remove (x) counter
    text = 'border countries'  if text.start_with?( 'border countries' )

    text
  end

end # module NormalizeHelper
|
43
|
+
end # module Factbook
|
data/lib/factbook/page.rb
CHANGED
@@ -39,18 +39,31 @@ class Page
|
|
39
39
|
def initialize( code, opts={} )
|
40
40
|
### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
|
41
41
|
|
42
|
-
if opts[:
|
43
|
-
|
44
|
-
|
45
|
-
else
|
46
|
-
|
47
|
-
|
48
|
-
|
42
|
+
if opts[:json]
|
43
|
+
json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
|
44
|
+
b = JsonBuilder.from_string( json )
|
45
|
+
else ## assume html
|
46
|
+
if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
|
47
|
+
## for debugging and testing allow "custom" passed-in html page
|
48
|
+
html = opts[:html]
|
49
|
+
else
|
50
|
+
url_string = SITE_BASE.gsub( '{code}', code )
|
51
|
+
## note: expects ASCII-7BIT/BINARY encoding
|
52
|
+
html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
|
53
|
+
end
|
54
|
+
b = Builder.from_string( html )
|
49
55
|
end
|
50
56
|
|
51
|
-
b = Builder.from_string( html )
|
52
57
|
@sects = b.sects
|
53
|
-
@info = b.
|
58
|
+
@info = b.info
|
59
|
+
|
60
|
+
## todo/fix/quick hack:
|
61
|
+
## check for info opts hash entry - lets you overwrite page info
|
62
|
+
## -- use proper header to setup page info - why, why not??
|
63
|
+
if opts[:info]
|
64
|
+
info = opts[:info]
|
65
|
+
@info = info
|
66
|
+
end
|
54
67
|
|
55
68
|
@data = {}
|
56
69
|
@sects.each do |sect|
|
@@ -83,43 +96,22 @@ class Page
|
|
83
96
|
end
|
84
97
|
|
85
98
|
## add convenience (shortcut) accessors / attributes / fields / getters
|
86
|
-
|
87
|
-
ATTRIBUTES = {
|
88
|
-
'Introduction' => [[:background, 'Background' ]],
|
89
|
-
'Geography' => [[:area, 'Area', 'total'], ## convert to number -- why? why not??
|
90
|
-
[:area_land, 'Area', 'land' ],
|
91
|
-
[:area_water, 'Area', 'water'],
|
92
|
-
[:area_note, 'Area', 'note' ],
|
93
|
-
[:area_comparative, 'Area - comparative'],
|
94
|
-
[:climate, 'Climate'],
|
95
|
-
[:terrain, 'Terrain'],
|
96
|
-
[:elevation_lowest, 'Elevation extremes', 'lowest point'],
|
97
|
-
[:elevation_highest,'Elevation extremes', 'highest point'],
|
98
|
-
[:resources, 'Natural resources']],
|
99
|
-
'People and Society' => [[:languages, 'Languages' ],
|
100
|
-
[:religions, 'Religions' ],
|
101
|
-
[:population, 'Population' ],
|
102
|
-
[:population_growth, 'Population growth rate' ],
|
103
|
-
[:birth_rate, 'Birth rate' ],
|
104
|
-
[:death_rate, 'Death rate' ],
|
105
|
-
[:migration_rate, 'Net migration rate' ],
|
106
|
-
[:major_cities, 'Major urban areas - population' ]],
|
107
|
-
}
|
108
99
|
|
109
|
-
ATTRIBUTES.each do |
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
100
|
+
ATTRIBUTES.each do |attrib|
|
101
|
+
## e.g.
|
102
|
+
## def background() data['Introduction']['Background']['text']; end
|
103
|
+
## def location() data['Geography']['Location']['text']; end
|
104
|
+
## etc.
|
105
|
+
if attrib.path.size == 1
|
106
|
+
define_method attrib.name.to_sym do
|
107
|
+
@data.fetch( attrib.category, {} ).
|
108
|
+
fetch( attrib.path[0], {} )['text']
|
109
|
+
end
|
110
|
+
else ## assume size 2 for now
|
111
|
+
define_method attrib.name.to_sym do
|
112
|
+
@data.fetch( attrib.category, {} ).
|
113
|
+
fetch( attrib.path[0], {} ).
|
114
|
+
fetch( attrib.path[1], {} )['text']
|
123
115
|
end
|
124
116
|
end
|
125
117
|
end
|