factbook 1.1.0 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,72 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+
6
class Almanac
  ## Renders a list of country pages through one shared ERB template
  ##   and returns the concatenated text.

  ## convenience helper ("factory")
  ##   reads the pages for the given country codes from json_dir
  ##   (via JsonPageReader) and wraps them in a new almanac
  def self.from_json( codes, json_dir: '.' )
    pages = JsonPageReader.new( json_dir ).read_pages( codes )
    new( pages )
  end


  ## pages - collection of page objects; each page gets rendered
  ##           through a PageCtx (see below)
  def initialize( pages )
    @pages = pages
  end

  ## Renders every page with the passed-in ERB template (source text).
  ##   returns the buffered text of all pages (in order)
  def render( template )
    buf = ''.dup   ## note: use a modifiable string (safe if string literals are frozen)
    @pages.each do |page|
      text = PageCtx.new( page, template ).render

      puts text   ## for debugging write country profile to console (too)
      buf << text
    end
    puts "count: #{@pages.count}"
    buf    ## return buffered almanac text
  end


  ## Template context for a single page
  ##   - the ERB template gets evaluated with this object as self
  ##     e.g. use <%= name %>, <%= names %> or <%= page.xxx %>
  class PageCtx
    attr_accessor :page

    def initialize(page, template)
      @page     = page
      @template = template
    end

    ##############################
    ## add some "view helpers"

    ## (short) name; falls back to the long name
    ##   if the (short) name is not available, that is, reads 'none'
    ##   e.g. Austria
    def name
      @name ||= begin
        short = page.name
        short == 'none' ? page.name_long : short
      end
    end

    ## name plus local name (if any and different) joined by separator
    ##   e.g. Austria • Österreich
    ## note: the list of names gets calculated once;
    ##         the separator gets applied on every call
    def names( separator: ' • ' )
      @names ||= begin
        local = page.name_local
        if blank_text?( local ) || local == 'none' || local == name
          [name]    ## no local (in its own non-english language) name
        else
          [name, local]
        end
      end
      @names.join( separator )
    end

    ## evaluate the template in the context of this object
    def render
      ERB.new( @template ).result( binding )
    end

    private

    ## true if value is nil, empty or whitespace only
    ##   note: plain ruby replacement for blank? (an ActiveSupport extension)
    ##         - avoids a NoMethodError if ActiveSupport is not loaded
    def blank_text?( value )
      !( value.to_s =~ /\A[[:space:]]*\z/ ).nil?
    end
  end  ## PageCtx

end  ## Almanac
71
+
72
+ end # module Factbook
@@ -0,0 +1,74 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module Factbook
5
+
6
class Attributes
  ## Flat list of attribute definitions (name, category, path)
  ##   built from a nested hash e.g. loaded from a yaml file.
  include Enumerable

  Attribute = Struct.new( :name,
                          :category,  ## e.g. Introduction, Geography, etc.
                          :path,      ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
                        )

  ## note: exclude special cases - the name key is NOT an attribute definition
  ##   if the last path entry reads:
  ##     Capital           -- incl. name key itself
  ##     National anthem
  NAME_KEY_EXCLUDES = ['Capital', 'National anthem'].freeze


  ## Loads the (nested) definitions from the yaml file at path;
  ##   the top-level keys are used as categories.
  ##   returns a new Attributes collection
  def self.from_yaml( path )
    h = YAML.load_file( path ) || {}   ## note: an empty file loads as nil/false
    pp h

    attribs = []

    ## note: no copy of the hash needed (build_attribs no longer changes the hash)
    h.each do |category, tree|
      build_attribs( attribs, category, [], tree )
    end

    new( attribs )
  end


  ## Walks the hash h (recursive) and appends an Attribute to attribs
  ##  for every hash that holds a 'name' key (except the special cases).
  ##
  ##   attribs  - array that collects the attributes (gets changed in place)
  ##   category - category recorded on every attribute found
  ##   path     - array of keys leading to h
  ##   h        - (nested) hash; is NOT modified
  def self.build_attribs( attribs, category, path, h )
    ## assume it's an attribute definition hash
    defines_attrib = h.has_key?( 'name' ) && !NAME_KEY_EXCLUDES.include?( path[-1] )

    if defines_attrib
      a = Attribute.new( h['name'], category, path )

      puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
      attribs << a
    end

    ## continue walking (recursive)
    h.each do |k,v|
      ## note: skip the (consumed) name key instead of deleting it
      ##         - keeps the passed-in hash untouched
      next if defines_attrib && k == 'name'

      build_attribs( attribs, category, path + [k], v )   ## note: path + [k] creates a new array (copy)
    end
  end


  def initialize( attribs )
    @attribs = attribs
  end

  def to_a()  @attribs; end
  def size()  @attribs.size; end

  ## yields every attribute; returns an enumerator if no block given
  def each
    return enum_for( :each ) unless block_given?

    @attribs.each { |attrib| yield( attrib ) }
  end

end # class Attributes
72
+
73
+ end # module Factbook
74
+
@@ -29,7 +29,7 @@ end
29
29
  attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
30
30
  :html, ## utf-8 encoded profile
31
31
  :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
32
- :page_info, ## incl. country_name, region_name, last_updated etc.
32
+ :info, ## page info incl. country_name, region_name, last_updated etc.
33
33
  :errors, ## encoding erros etc.
34
34
  :sects
35
35
 
@@ -38,7 +38,7 @@ def initialize( html_ascii )
38
38
  @html_ascii = html_ascii
39
39
 
40
40
  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
- @html, @page_info, @errors = Sanitizer.new.sanitize( @html_ascii )
41
+ @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
42
42
 
43
43
  @html_debug = map_sects( @html )
44
44
  @html_debug = map_subsects( @html_debug )
@@ -4,6 +4,7 @@ module Factbook
4
4
 
5
5
  class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
6
6
  include LogUtils::Logging
7
+ include NormalizeHelper ## e.g. normalize_category
7
8
 
8
9
  def initialize( html, name )
9
10
  @html = html
@@ -42,7 +43,7 @@ def read
42
43
  last_node['text'] += " #{text}" ## append w/o separator
43
44
  end
44
45
  else
45
- if @name == 'demographic_profile' || @name == 'Demographic profile' ## special case (use space a sep)
46
+ if @name == 'Demographic profile' ## special case (use space a sep)
46
47
  last_node['text'] += " #{text}" ## append without (w/o) separator
47
48
  else
48
49
  last_node['text'] += " ++ #{text}" ## append with ++ separator
@@ -60,14 +61,11 @@ def read
60
61
  ## pp spans
61
62
 
62
63
  span_key = spans[0] ## assume 1st entry is span.category
63
- span_value = spans[1] ## assume 2nd entry is span.category_data')
64
- ## allow optional category_data for now
65
- key = span_key.text
66
-
67
- key = key.strip
68
- key = key.sub( /:\z/, '' ) # remove trailing : if present
69
- key = key.strip
64
+ span_value = spans[1] ## assume 2nd entry is span.category_data
65
+
66
+ key = normalize_category( span_key.text )
70
67
 
68
+ ## note: allow optional category_data for now
71
69
  value = span_value ? span_value.text : nil
72
70
 
73
71
  puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
@@ -87,6 +85,7 @@ def read
87
85
  pp data
88
86
  data
89
87
  end
88
+
90
89
 
91
90
  end # class ItemBuilder
92
91
 
@@ -0,0 +1,79 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ ######
6
+ # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
7
+
8
class JsonBuilder
  ## json builder -- rebuilds the sections of a page
  ##   from "dumped" json (instead of parsing the html page)
  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category


  ## Reads the json text from the file at path and builds from it.
  ##   note: always reads as utf-8 (and skips a leading byte order mark)
  ##         instead of relying on the platform default encoding
  def self.from_file( path )
    text = File.open( path, 'r:bom|utf-8' ) { |f| f.read }
    from_string( text )
  end

  def self.from_string( text )
    new( text )
  end


  attr_reader :text,
              :json,
              :info,       ## not used yet -- page info incl. country_name, region_name, last_updated etc.
              :errors,     ## not used yet -- encoding errors etc.
              :sects


  ## text - json document (as a string); gets parsed here.
  ##   every top-level entry becomes a section (Sect) and
  ##   every entry below becomes a subsection (Subsect).
  def initialize( text )
    @text = text

    @json = JSON.parse( text )

    @info   = nil   ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
    @errors = []    ## fix/todo: sorry - for now no errors possible/tracked

    @sects = @json.map do |sect_title, sect_subsects|
      build_sect( sect_title, sect_subsects )
    end
  end

  private

  ## build a section with all its subsections
  def build_sect( title, subsects )
    sect = Sect.new
    sect.title    = title
    sect.subsects = subsects.map do |subsect_title, subsect_data|
      build_subsect( subsect_title, subsect_data )
    end
    sect
  end

  ## build a single subsection
  def build_subsect( title, data )
    subsect = Subsect.new
    subsect.title = title
    subsect.data  = normalize_data( data )
    subsect
  end

  ## note: run the keys of a data hash through normalize_category (again);
  ##         anything else (that is not a hash) gets passed through as is
  def normalize_data( data )
    return data unless data.is_a?( Hash )

    data.each_with_object( {} ) do |(key, value), normalized|
      normalized[ normalize_category( key ) ] = value
    end
  end

end # class JsonBuilder
77
+
78
+
79
+ end # module Factbook
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
class Counter
  ## Counts how often every (nested) hash key shows up across pages and
  ##   tracks the country codes of the pages for rarely used keys.
  ##
  ## data holds the result; every counted key maps to a hash with
  ##   :count - number of pages counted with that key
  ##   :codes - country codes separated by space (dropped once the
  ##              count gets larger than the threshold)
  ##   plus the (nested) counted keys below

  attr_reader :data

  ## max_codes - threshold; the codes entry gets deleted
  ##               if the count is larger than max_codes (default: 9)
  def initialize( max_codes: 9 )
    @data      = {}
    @max_codes = max_codes
  end

  ## walk the page data hash and add the nodes to data
  ##   note: page must respond to data and info.country_code
  def count( page )
    walk( page, page.data, @data )
  end


  private

  ## note: only values that are hashes get counted (and walked);
  ##         all other values get skipped
  def walk( page, hin, hout )
    hin.each do |key, value|
      next unless value.is_a?( Hash )

      ## note: use a modifiable string for codes (safe if string literals are frozen)
      node = ( hout[key] ||= { count: 0, codes: ''.dup } )

      node[ :count ] += 1

      ## delete codes if larger (threshold) than x (e.g. 9)
      node.delete( :codes ) if node[ :count ] > @max_codes

      codes = node[ :codes ]
      if codes     ## note: might have been deleted if passed threshold (e.g. 9 entries)
        codes << ' '  unless codes.empty?   ## add separator (space for now)
        codes << page.info.country_code
      end

      walk( page, value, node )
    end
  end

end # class Counter
47
+
48
+ end # module Factbook
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
module NormalizeHelper

  #######################################
  ### special cases - maps a (cleaned-up) category to its normalized form
  NORMALIZED_CATEGORIES = {
    ## typos e.g ntoe => use note
    'ntoe'                           => 'note',
    'investment if fixed capital'    => 'investment in fixed capital',

    ## downcase
    'Lowest point'                   => 'lowest point',
    'Chief of state'                 => 'chief of state',

    ## spelling variant (use more popular one)
    'signed but not ratified'        => 'signed, but not ratified',
    'vectorborne disease'            => 'vectorborne diseases',
    'water contact diseases'         => 'water contact disease',
    'food or waterborne disease'     => 'food or waterborne diseases',
    'geographical coordinates'       => 'geographic coordinates',
    'notes'                          => 'note',
    'refugees (countries of origin)' => 'refugees (country of origin)',
  }.freeze


  ## Normalizes a category (key) text:
  ##   - strips leading and trailing whitespace
  ##   - removes trailing colons
  ##   - fixes known typos, casing and spelling variants (see table above)
  ##   - cuts off anything after border countries e.g. the (x) counter
  ## returns the normalized text (always a new string)
  def normalize_category( text )
    ## note: fix typos/errors with double colons e.g. note:: (instead of note:)
    ##   remove trailing : if present -- note: allow (fix) note:: too, thus, use :+
    text = text.strip.sub( /:+\z/, '' ).strip

    ## border countries (8): -- remove (x) counter
    return 'border countries'  if text.start_with?( 'border countries' )

    ## note: hand out a copy - keeps the shared table entries untouched
    ##         if the caller changes the returned string in place
    fixed = NORMALIZED_CATEGORIES[ text ]
    fixed ? fixed.dup : text
  end

end # module NormalizeHelper
43
+ end # module Factbook
@@ -39,18 +39,31 @@ class Page
39
39
  def initialize( code, opts={} )
40
40
  ### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
41
41
 
42
- if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
43
- ## for debugging and testing allow "custom" passed-in html page
44
- html = opts[:html]
45
- else
46
- url_string = SITE_BASE.gsub( '{code}', code )
47
- ## note: expects ASCII-7BIT/BINARY encoding
48
- html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
42
+ if opts[:json]
43
+ json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
44
+ b = JsonBuilder.from_string( json )
45
+ else ## assume html
46
+ if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
47
+ ## for debugging and testing allow "custom" passed-in html page
48
+ html = opts[:html]
49
+ else
50
+ url_string = SITE_BASE.gsub( '{code}', code )
51
+ ## note: expects ASCII-7BIT/BINARY encoding
52
+ html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
53
+ end
54
+ b = Builder.from_string( html )
49
55
  end
50
56
 
51
- b = Builder.from_string( html )
52
57
  @sects = b.sects
53
- @info = b.page_info ## todo: change b.page_info to info too - why? why not??
58
+ @info = b.info
59
+
60
+ ## todo/fix/quick hack:
61
+ ## check for info opts hash entry - lets you overwrite page info
62
+ ## -- use proper header to setup page info - why, why not??
63
+ if opts[:info]
64
+ info = opts[:info]
65
+ @info = info
66
+ end
54
67
 
55
68
  @data = {}
56
69
  @sects.each do |sect|
@@ -83,43 +96,22 @@ class Page
83
96
  end
84
97
 
85
98
  ## add convenience (shortcut) accessors / attributes / fields / getters
86
-
87
- ATTRIBUTES = {
88
- 'Introduction' => [[:background, 'Background' ]],
89
- 'Geography' => [[:area, 'Area', 'total'], ## convert to number -- why? why not??
90
- [:area_land, 'Area', 'land' ],
91
- [:area_water, 'Area', 'water'],
92
- [:area_note, 'Area', 'note' ],
93
- [:area_comparative, 'Area - comparative'],
94
- [:climate, 'Climate'],
95
- [:terrain, 'Terrain'],
96
- [:elevation_lowest, 'Elevation extremes', 'lowest point'],
97
- [:elevation_highest,'Elevation extremes', 'highest point'],
98
- [:resources, 'Natural resources']],
99
- 'People and Society' => [[:languages, 'Languages' ],
100
- [:religions, 'Religions' ],
101
- [:population, 'Population' ],
102
- [:population_growth, 'Population growth rate' ],
103
- [:birth_rate, 'Birth rate' ],
104
- [:death_rate, 'Death rate' ],
105
- [:migration_rate, 'Net migration rate' ],
106
- [:major_cities, 'Major urban areas - population' ]],
107
- }
108
99
 
109
- ATTRIBUTES.each do |section_title, attribs|
110
- attribs.each do |attrib|
111
- ## e.g.
112
- ## def background() data['Introduction']['Background']['text']; end
113
- ## def location() data['Geography']['Location']['text']; end
114
- ## etc.
115
- if attrib.size == 2
116
- define_method attrib[0] do
117
- @data.fetch( section_title, {} ).fetch( attrib[1], {} )['text']
118
- end
119
- else ## assume size 3 for now
120
- define_method attrib[0] do
121
- @data.fetch( section_title, {} ).fetch( attrib[1], {} ).fetch( attrib[2], {} )['text']
122
- end
100
+ ATTRIBUTES.each do |attrib|
101
+ ## e.g.
102
+ ## def background() data['Introduction']['Background']['text']; end
103
+ ## def location() data['Geography']['Location']['text']; end
104
+ ## etc.
105
+ if attrib.path.size == 1
106
+ define_method attrib.name.to_sym do
107
+ @data.fetch( attrib.category, {} ).
108
+ fetch( attrib.path[0], {} )['text']
109
+ end
110
+ else ## assume size 2 for now
111
+ define_method attrib.name.to_sym do
112
+ @data.fetch( attrib.category, {} ).
113
+ fetch( attrib.path[0], {} ).
114
+ fetch( attrib.path[1], {} )['text']
123
115
  end
124
116
  end
125
117
  end