factbook 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,72 @@
1
+ # encoding: utf-8
2
+
3
module Factbook


  ##
  # Renders a list of (country) pages into one text document by
  # running every page through the same ERB template.
  class Almanac

    ## convenience helper ("factory")
    ##   reads the pages for the given codes from json files in json_dir
    def self.from_json( codes, json_dir: '.' )
      pages = JsonPageReader.new( json_dir ).read_pages( codes )
      new( pages )
    end


    ## pages -- enumerable of page objects; every page must respond to
    ##          name, name_long and name_local (used by the view helpers)
    def initialize( pages )
      @pages = pages
    end

    ## Renders every page with the passed-in ERB template (source text)
    ## and returns the concatenated result.
    ##  note: every rendered page and the page count get echoed to the
    ##        console too (debugging aid kept from the original code)
    def render( template )
      buf = ''.dup   ## note: use an unfrozen copy - gets appended to
      @pages.each do |page|
        text = PageCtx.new( page, template ).render

        puts text   ## for debugging write country profile to console (too)
        buf << text
      end
      puts "count: #{@pages.count}"
      buf    ## return buffered almanac text
    end


    ##
    # Template (binding) context for a single page;
    # the ERB template gets evaluated with an instance of this class as self.
    class PageCtx
      attr_accessor :page

      def initialize( page, template )
        @page     = page
        @template = template
      end

      ##############################
      ## add some "view helpers"

      ## calculate name -- use long name if (short) name is not available
      ##   (that is, nil, empty/blank or the literal 'none')
      ##   e.g. Austria
      def name
        @name ||= begin
          short = page.name
          missing?( short ) ? page.name_long : short
        end
      end

      ## name plus local (non-english) name joined by separator
      ##   e.g. Austria • Österreich
      ## note: the local name gets dropped if missing or same as name
      def names( separator: ' • ' )
        @names ||= begin
          local = page.name_local
          if missing?( local ) || local == name
            [name]           ## no local (in its own non-english language) name
          else
            [name, local]
          end
        end
        @names.join( separator )
      end

      ## evaluate the template with this context as binding
      def render
        ERB.new( @template ).result( binding )
      end

      private

      ## true for nil, empty / whitespace-only values and the literal 'none'
      ##  note: plain ruby on purpose (no blank? from activesupport required)
      def missing?( value )
        value.nil? || value.to_s.strip.empty? || value == 'none'
      end
    end ## PageCtx

  end ## Almanac

end # module Factbook
@@ -0,0 +1,74 @@
1
+ # encoding: utf-8
2
+
3
+
4
module Factbook

  ##
  # List of attribute definitions (name, category, path) built from a
  # nested definition hash, e.g. loaded from a yaml file.
  class Attributes
    include Enumerable    ## note: built on each - adds map, select, etc.

    Attribute = Struct.new( :name,
                            :category,   ## e.g. Introduction, Geography, etc.
                            :path        ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
                          )

    ## special cases: for these entries the nested 'name' key is a
    ##   regular sub-entry of its own (NOT the attribute name)
    NAME_IS_SUBKEY = ['Capital', 'National anthem'].freeze


    ## Loads the definitions from the yaml file at path;
    ##  top-level keys are the categories (e.g. Introduction, Geography).
    ##  note: the loaded hash gets dumped to the console (debugging aid)
    def self.from_yaml( path )
      h = YAML.load_file( path ) || {}    ## note: empty file loads as nil/false
      pp h

      attribs = []
      h.each do |category, defs|
        build_attribs( attribs, category, [], defs )
      end

      new( attribs )
    end


    ## Walks the definition hash h (recursive) and appends an Attribute
    ## to attribs for every hash with a 'name' entry.
    ##   attribs  -- array collecting the attributes (gets appended to)
    ##   category -- category stored in every attribute found
    ##   path     -- array of keys leading to h
    ##   h        -- (nested) definition hash
    ## note: h does NOT get changed; the 'name' entry gets skipped while
    ##   walking instead of deleted (a shallow h.dup would not protect
    ##   the nested hashes)
    def self.build_attribs( attribs, category, path, h )
      ## nil or scalar leaf (e.g. yaml key without value); nothing to do
      return unless h.is_a?( Hash )

      ## assume it's an attribute definition hash if it has a name
      ##  -- unless the name key is a regular entry (see special cases)
      definition = h.key?( 'name' ) && !NAME_IS_SUBKEY.include?( path[-1] )

      if definition
        a = Attribute.new( h['name'], category, path )

        puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
        attribs << a
      end

      ## continue walking (recursive)
      h.each do |k,v|
        next if definition && k == 'name'     ## already used as attribute name
        build_attribs( attribs, category, path + [k], v )  ## note: path + [k] creates a new array (copy)
      end
    end


    def initialize( attribs )
      @attribs = attribs
    end

    def to_a()  @attribs; end
    def size()  @attribs.size; end

    ## yields every attribute; returns an enumerator if no block is passed in
    def each( &block )
      return enum_for( :each ) unless block

      @attribs.each( &block )
    end

  end # class Attributes

end # module Factbook
74
+
@@ -29,7 +29,7 @@ end
29
29
  attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
30
30
  :html, ## utf-8 encoded profile
31
31
  :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
32
- :page_info, ## incl. country_name, region_name, last_updated etc.
32
+ :info, ## page info incl. country_name, region_name, last_updated etc.
33
33
  :errors, ## encoding erros etc.
34
34
  :sects
35
35
 
@@ -38,7 +38,7 @@ def initialize( html_ascii )
38
38
  @html_ascii = html_ascii
39
39
 
40
40
  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
- @html, @page_info, @errors = Sanitizer.new.sanitize( @html_ascii )
41
+ @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
42
42
 
43
43
  @html_debug = map_sects( @html )
44
44
  @html_debug = map_subsects( @html_debug )
@@ -4,6 +4,7 @@ module Factbook
4
4
 
5
5
  class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
6
6
  include LogUtils::Logging
7
+ include NormalizeHelper ## e.g. normalize_category
7
8
 
8
9
  def initialize( html, name )
9
10
  @html = html
@@ -42,7 +43,7 @@ def read
42
43
  last_node['text'] += " #{text}" ## append w/o separator
43
44
  end
44
45
  else
45
- if @name == 'demographic_profile' || @name == 'Demographic profile' ## special case (use space a sep)
46
+ if @name == 'Demographic profile' ## special case (use space a sep)
46
47
  last_node['text'] += " #{text}" ## append without (w/o) separator
47
48
  else
48
49
  last_node['text'] += " ++ #{text}" ## append with ++ separator
@@ -60,14 +61,11 @@ def read
60
61
  ## pp spans
61
62
 
62
63
  span_key = spans[0] ## assume 1st entry is span.category
63
- span_value = spans[1] ## assume 2nd entry is span.category_data')
64
- ## allow optional category_data for now
65
- key = span_key.text
66
-
67
- key = key.strip
68
- key = key.sub( /:\z/, '' ) # remove trailing : if present
69
- key = key.strip
64
+ span_value = spans[1] ## assume 2nd entry is span.category_data
65
+
66
+ key = normalize_category( span_key.text )
70
67
 
68
+ ## note: allow optional category_data for now
71
69
  value = span_value ? span_value.text : nil
72
70
 
73
71
  puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
@@ -87,6 +85,7 @@ def read
87
85
  pp data
88
86
  data
89
87
  end
88
+
90
89
 
91
90
  end # class ItemBuilder
92
91
 
@@ -0,0 +1,79 @@
1
+ # encoding: utf-8
2
+
3
module Factbook

  ######
  # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)

  class JsonBuilder
    include LogUtils::Logging
    include NormalizeHelper    ## e.g. normalize_category


    ## Reads the json text from the file at path and builds from it.
    ##  note: always read as utf-8 and strip a leading byte order mark (bom)
    ##    if present -- otherwise the result depends on the default
    ##    external encoding and a bom breaks JSON.parse
    def self.from_file( path )
      text = File.open( path, 'r:bom|utf-8' ) { |f| f.read }
      from_string( text )
    end

    ## text -- json as a string (NOT yet parsed to structured data)
    def self.from_string( text )
      new( text )
    end


    attr_reader :text,
                :json,
                :info,       ## not used yet -- page info incl. country_name, region_name, last_updated etc.
                :errors,     ## not used yet -- encoding errors etc.
                :sects


    ## Parses text and builds one Sect per top-level entry with one
    ## Subsect per second-level entry.
    ##   raises whatever JSON.parse raises on invalid json
    def initialize( text )
      @text = text

      @json = JSON.parse( text )

      @info   = nil   ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
      @errors = []    ## fix/todo: sorry - for now no errors possible/tracked

      @sects = []
      @json.each do |sect_title, sect_subsects|
        @sects << build_sect( sect_title, sect_subsects )
      end
    end


    private

    ## build a section incl. all of its subsections
    def build_sect( title, subsects_data )
      sect = Sect.new
      sect.title = title

      subsects = []
      subsects_data.each do |subsect_title, subsect_data|
        subsects << build_subsect( subsect_title, subsect_data )
      end

      sect.subsects = subsects
      sect
    end

    ## build a subsection
    ##  note: the keys of a data hash get run through normalize_category (again);
    ##        non-hash data gets passed along as is
    def build_subsect( title, data )
      subsect = Subsect.new
      subsect.title = title

      if data.is_a?( Hash )
        normalized = {}
        data.each do |key, value|
          normalized[ normalize_category( key ) ] = value
        end
        data = normalized
      end

      subsect.data = data
      subsect
    end

  end # class JsonBuilder


end # module Factbook
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+
3
module Factbook

  ##
  # Counts how often every (nested) hash entry shows up across pages.
  #
  # The result in data mirrors the nesting of the page data hashes; every
  # node holds :count and - as long as count stays within max_codes -
  # :codes, a space-separated list of the country codes of the pages
  # the entry was found in.
  class Counter

    DEFAULT_MAX_CODES = 9   ## codes get dropped once count is larger

    attr_reader :data

    ## max_codes -- threshold; once an entry is found in more pages
    ##              the list of codes gets deleted (only the count is kept)
    def initialize( max_codes: DEFAULT_MAX_CODES )
      @data      = {}
      @max_codes = max_codes
    end

    ## Adds the entries of page to data.
    ##  page must respond to data (nested hash) and info.country_code
    def count( page )
      ## walk page data hash
      #   add nodes to data
      walk( page, page.data, @data )
    end


    private

    ## note: only hash values are entries (nodes); everything else
    ##   (e.g. text) is skipped
    def walk( page, hin, hout )
      hin.each do |k,v|
        next unless v.is_a?( Hash )

        node = hout[k] || { count: 0, codes: ''.dup }

        node[ :count ] += 1

        ## delete codes if larger than threshold (e.g. 9)
        node.delete( :codes ) if node[ :count ] > @max_codes

        codes = node[ :codes ]
        if codes    ## note: might have been deleted if threshold was passed
          codes << ' '  unless codes.empty?   ## add separator (space for now)
          codes << page.info.country_code
        end

        hout[k] = node
        walk( page, v, node )
      end
    end

  end # class Counter

end # module Factbook
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+
3
module Factbook
  module NormalizeHelper


    ## Returns the canonical form of a category (key) text:
    ##   - surrounding whitespace and trailing colons get removed
    ##     note: fixes typos/errors with double colons too e.g. note:: (instead of note:)
    ##   - known typos, case and spelling variants get mapped to one form
    ##   - border countries (8) gets its (x) counter removed
    def normalize_category( text )
      label = text.strip.sub( /:+\z/, '' ).strip   ## :+ allows (fixes) note:: too

      ## border countries (8): -- remove (x) counter
      return 'border countries'   if label.start_with?( 'border countries')

      #######################################
      ### special cases
      case label
      ## typos e.g ntoe => use note
      when 'ntoe'                           then 'note'
      when 'investment if fixed capital'    then 'investment in fixed capital'
      ## downcase
      when 'Lowest point'                   then 'lowest point'
      when 'Chief of state'                 then 'chief of state'
      ## spelling variant (use more popular one)
      when 'signed but not ratified'        then 'signed, but not ratified'
      when 'vectorborne disease'            then 'vectorborne diseases'
      when 'water contact diseases'         then 'water contact disease'
      when 'food or waterborne disease'     then 'food or waterborne diseases'
      when 'geographical coordinates'       then 'geographic coordinates'
      when 'notes'                          then 'note'
      when 'refugees (countries of origin)' then 'refugees (country of origin)'
      else label
      end
    end


  end # module NormalizeHelper
end # module Factbook
@@ -39,18 +39,31 @@ class Page
39
39
  def initialize( code, opts={} )
40
40
  ### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
41
41
 
42
- if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
43
- ## for debugging and testing allow "custom" passed-in html page
44
- html = opts[:html]
45
- else
46
- url_string = SITE_BASE.gsub( '{code}', code )
47
- ## note: expects ASCII-7BIT/BINARY encoding
48
- html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
42
+ if opts[:json]
43
+ json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
44
+ b = JsonBuilder.from_string( json )
45
+ else ## assume html
46
+ if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
47
+ ## for debugging and testing allow "custom" passed-in html page
48
+ html = opts[:html]
49
+ else
50
+ url_string = SITE_BASE.gsub( '{code}', code )
51
+ ## note: expects ASCII-7BIT/BINARY encoding
52
+ html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
53
+ end
54
+ b = Builder.from_string( html )
49
55
  end
50
56
 
51
- b = Builder.from_string( html )
52
57
  @sects = b.sects
53
- @info = b.page_info ## todo: change b.page_info to info too - why? why not??
58
+ @info = b.info
59
+
60
+ ## todo/fix/quick hack:
61
+ ## check for info opts hash entry - lets you overwrite page info
62
+ ## -- use proper header to setup page info - why, why not??
63
+ if opts[:info]
64
+ info = opts[:info]
65
+ @info = info
66
+ end
54
67
 
55
68
  @data = {}
56
69
  @sects.each do |sect|
@@ -83,43 +96,22 @@ class Page
83
96
  end
84
97
 
85
98
  ## add convenience (shortcut) accessors / attributes / fields / getters
86
-
87
- ATTRIBUTES = {
88
- 'Introduction' => [[:background, 'Background' ]],
89
- 'Geography' => [[:area, 'Area', 'total'], ## convert to number -- why? why not??
90
- [:area_land, 'Area', 'land' ],
91
- [:area_water, 'Area', 'water'],
92
- [:area_note, 'Area', 'note' ],
93
- [:area_comparative, 'Area - comparative'],
94
- [:climate, 'Climate'],
95
- [:terrain, 'Terrain'],
96
- [:elevation_lowest, 'Elevation extremes', 'lowest point'],
97
- [:elevation_highest,'Elevation extremes', 'highest point'],
98
- [:resources, 'Natural resources']],
99
- 'People and Society' => [[:languages, 'Languages' ],
100
- [:religions, 'Religions' ],
101
- [:population, 'Population' ],
102
- [:population_growth, 'Population growth rate' ],
103
- [:birth_rate, 'Birth rate' ],
104
- [:death_rate, 'Death rate' ],
105
- [:migration_rate, 'Net migration rate' ],
106
- [:major_cities, 'Major urban areas - population' ]],
107
- }
108
99
 
109
- ATTRIBUTES.each do |section_title, attribs|
110
- attribs.each do |attrib|
111
- ## e.g.
112
- ## def background() data['Introduction']['Background']['text']; end
113
- ## def location() data['Geography']['Location']['text']; end
114
- ## etc.
115
- if attrib.size == 2
116
- define_method attrib[0] do
117
- @data.fetch( section_title, {} ).fetch( attrib[1], {} )['text']
118
- end
119
- else ## assume size 3 for now
120
- define_method attrib[0] do
121
- @data.fetch( section_title, {} ).fetch( attrib[1], {} ).fetch( attrib[2], {} )['text']
122
- end
100
+ ATTRIBUTES.each do |attrib|
101
+ ## e.g.
102
+ ## def background() data['Introduction']['Background']['text']; end
103
+ ## def location() data['Geography']['Location']['text']; end
104
+ ## etc.
105
+ if attrib.path.size == 1
106
+ define_method attrib.name.to_sym do
107
+ @data.fetch( attrib.category, {} ).
108
+ fetch( attrib.path[0], {} )['text']
109
+ end
110
+ else ## assume size 2 for now
111
+ define_method attrib.name.to_sym do
112
+ @data.fetch( attrib.category, {} ).
113
+ fetch( attrib.path[0], {} ).
114
+ fetch( attrib.path[1], {} )['text']
123
115
  end
124
116
  end
125
117
  end