factbook 1.1.1 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +4 -4
  2. data/{HISTORY.md → CHANGELOG.md} +3 -3
  3. data/Manifest.txt +1 -58
  4. data/README.md +50 -575
  5. data/Rakefile +29 -33
  6. data/lib/factbook.rb +8 -75
  7. metadata +20 -114
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -214
  16. data/lib/factbook/builder_item.rb +0 -92
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -185
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -207
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -102
  34. data/lib/factbook/version.rb +0 -22
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -18
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/au.html +0 -2006
  48. data/test/data/src/be.html +0 -2011
  49. data/test/helper.rb +0 -11
  50. data/test/test_attribs.rb +0 -82
  51. data/test/test_attribs_def.rb +0 -20
  52. data/test/test_builder.rb +0 -35
  53. data/test/test_codes.rb +0 -76
  54. data/test/test_comparisons.rb +0 -19
  55. data/test/test_convert.rb +0 -30
  56. data/test/test_counter.rb +0 -31
  57. data/test/test_fields.rb +0 -52
  58. data/test/test_importer.rb +0 -55
  59. data/test/test_item_builder.rb +0 -99
  60. data/test/test_json.rb +0 -44
  61. data/test/test_json_builder.rb +0 -25
  62. data/test/test_normalize.rb +0 -23
  63. data/test/test_page.rb +0 -38
  64. data/test/test_sanitizer.rb +0 -35
@@ -1,75 +0,0 @@
1
- Num,Category,Name
2
- 2147,Geography,Area
3
- 2119,People and Society,Population
4
- 2002,People and Society,Population growth rate
5
- 2054,People and Society,Birth rate
6
- 2066,People and Society,Death rate
7
- 2112,People and Society,Net migration rate
8
- 2223,People and Society,Maternal mortality rate
9
- 2091,People and Society,Infant mortality rate
10
- 2102,People and Society,Life expectancy at birth
11
- 2127,People and Society,Total fertility rate
12
- 2225,People and Society,Health expenditures
13
- 2155,People and Society,HIV/AIDS - adult prevalence rate
14
- 2156,People and Society,HIV/AIDS - people living with HIV/AIDS
15
- 2157,People and Society,HIV/AIDS - deaths
16
- 2228,People and Society,Obesity - adult prevalence rate
17
- 2224,People and Society,Children under the age of 5 years underweight
18
- 2206,People and Society,Education expenditures
19
- 2229,People and Society,"Unemployment, youth ages 15-24"
20
- 2001,Economy,GDP (purchasing power parity)
21
- 2003,Economy,GDP - real growth rate
22
- 2004,Economy,GDP - per capita (PPP)
23
- 2260,Economy,Gross national saving
24
- 2089,Economy,Industrial production growth rate
25
- 2095,Economy,Labor force
26
- 2129,Economy,Unemployment rate
27
- 2172,Economy,Distribution of family income - Gini index
28
- 2221,Economy,Taxes and other revenues
29
- 2222,Economy,Budget surplus (+) or deficit (-)
30
- 2186,Economy,Public debt
31
- 2092,Economy,Inflation rate (consumer prices)
32
- 2207,Economy,Central bank discount rate
33
- 2208,Economy,Commercial bank prime lending rate
34
- 2214,Economy,Stock of narrow money
35
- 2215,Economy,Stock of broad money
36
- 2211,Economy,Stock of domestic credit
37
- 2200,Economy,Market value of publicly traded shares
38
- 2187,Economy,Current account balance
39
- 2078,Economy,Exports
40
- 2087,Economy,Imports
41
- 2188,Economy,Reserves of foreign exchange and gold
42
- 2079,Economy,Debt - external
43
- 2198,Economy,Stock of direct foreign investment - at home
44
- 2199,Economy,Stock of direct foreign investment - abroad
45
- 2232,Energy,Electricity - production
46
- 2233,Energy,Electricity - consumption
47
- 2234,Energy,Electricity - exports
48
- 2235,Energy,Electricity - imports
49
- 2236,Energy,Electricity - installed generating capacity
50
- 2237,Energy,Electricity - from fossil fuels
51
- 2239,Energy,Electricity - from nuclear fuels
52
- 2238,Energy,Electricity - from hydroelectric plants
53
- 2240,Energy,Electricity - from other renewable sources
54
- 2241,Energy,Crude oil - production
55
- 2242,Energy,Crude oil - exports
56
- 2243,Energy,Crude oil - imports
57
- 2244,Energy,Crude oil - proved reserves
58
- 2245,Energy,Refined petroleum products - production
59
- 2246,Energy,Refined petroleum products - consumption
60
- 2247,Energy,Refined petroleum products - exports
61
- 2248,Energy,Refined petroleum products - imports
62
- 2249,Energy,Natural gas - production
63
- 2250,Energy,Natural gas - consumption
64
- 2251,Energy,Natural gas - exports
65
- 2252,Energy,Natural gas - imports
66
- 2253,Energy,Natural gas - proved reserves
67
- 2150,Communications,Telephones - fixed lines
68
- 2151,Communications,Telephones - mobile cellular
69
- 2153,Communications,Internet users
70
- 2053,Transportation,Airports
71
- 2121,Transportation,Railways
72
- 2085,Transportation,Roadways
73
- 2093,Transportation,Waterways
74
- 2108,Transportation,Merchant marine
75
- 2034,Military,Military expenditures
@@ -1,72 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Almanac
7
-
8
- ## convenience helper ("factory")
9
- def self.from_json( codes, json_dir: '.' )
10
- pages = JsonPageReader.new( json_dir ).read_pages( codes )
11
- self.new( pages )
12
- end
13
-
14
-
15
- def initialize( pages )
16
- @pages = pages
17
- end
18
-
19
- def render( template )
20
- buf = ''
21
- @pages.each do |page|
22
- text = PageCtx.new( page, template ).render
23
-
24
- puts text ## for debugging write country profile to console (too)
25
- buf << text
26
- end
27
- puts "count: #{@pages.count}"
28
- buf ## return buffered almanac text
29
- end
30
-
31
-
32
- class PageCtx
33
- attr_accessor :page
34
-
35
- def initialize(page, template)
36
- @page = page
37
- @template = template
38
- end
39
-
40
- ##############################
41
- ## add some "view helpers"
42
-
43
- def name
44
- ## -- calculate name (use long name if (short) name is not available e.g. none)
45
- ## e.g. Austria
46
- if @name.nil?
47
- @name = page.name
48
- @name = page.name_long if @name == 'none'
49
- end
50
- @name
51
- end
52
-
53
- def names( separator: ' • ' )
54
- ## e.g. Austria • Österreich
55
- if @names.nil?
56
- if page.name_local.blank? || page.name_local == 'none' || page.name_local == name
57
- @names = [name] ## no local (in its own non-english language) name
58
- else
59
- @names = [name, page.name_local]
60
- end
61
- end
62
- @names.join( separator )
63
- end
64
-
65
- def render
66
- ERB.new( @template).result( binding )
67
- end
68
- end ## PageCtx
69
-
70
- end ## Almanac
71
-
72
- end # module Factbook
@@ -1,74 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module Factbook
5
-
6
- class Attributes
7
-
8
- Attribute = Struct.new( :name,
9
- :category, ## e.g. Introduction, Geography, etc.
10
- :path, ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
11
- )
12
-
13
- def self.from_yaml( path )
14
-
15
- h = YAML.load_file( path )
16
- pp h
17
-
18
- attribs = []
19
-
20
- ## note: use a copy (e.g. h.dup) for now (hash gets changed by build_attribs!!)
21
- new_h = h.dup
22
- new_h.each do |k,v|
23
- category = k
24
- build_attribs( attribs, category, [], v )
25
- end
26
-
27
- self.new( attribs )
28
- end
29
-
30
-
31
- def self.build_attribs( attribs, category, path, h )
32
-
33
- ## assume it's an attribute definition hash
34
- ## note: !! exclude special cases:
35
- ## Capital -- incl. name key itself
36
- ## National anthem
37
- if h.has_key?( 'name' ) && ['Capital','National anthem'].include?( path[-1] ) == false
38
- a = Attribute.new
39
- a.name = h['name']
40
- a.category = category
41
- a.path = path
42
-
43
- puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
44
- attribs << a
45
-
46
- ## note: make sure a modifiable copy (of h) gets passed in
47
- h.delete( 'name' )
48
- end
49
-
50
- return if h.empty? ## empty hash; nothing (more) to do; return
51
-
52
- ## continue walking (recursive)
53
- h.each do |k,v|
54
- new_path = path.dup << k ## note: create a new array (copy)
55
- build_attribs( attribs, category, new_path, v )
56
- end
57
- end
58
-
59
-
60
- def initialize( attribs )
61
- @attribs = attribs
62
- end
63
-
64
- def to_a() @attribs; end
65
- def size() @attribs.size; end
66
-
67
- def each
68
- @attribs.each { |attrib| yield( attrib ) }
69
- end
70
-
71
- end # class Attributes
72
-
73
- end # module Factbook
74
-
@@ -1,214 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class Builder ## todo: change to PageBuilder ???
6
- include LogUtils::Logging
7
-
8
-
9
- =begin
10
- def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
11
- ## check/todo: rename input_dir to just dir or to include ?
12
- ## (there's no output_dir)?? - why? why not?
13
- input_dir = opts[:input_dir] || '.'
14
- self.from_file( "#{input_dir}/#{cc}.html" )
15
- end
16
- =end
17
-
18
-
19
- def self.from_file( path )
20
- html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
21
- self.from_string( html_ascii )
22
- end
23
-
24
- def self.from_string( html_ascii ) ## note: expects ASCII-7BIT/BINARY encoding
25
- self.new( html_ascii )
26
- end
27
-
28
-
29
- attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
30
- :html, ## utf-8 encoded profile
31
- :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
32
- :info, ## page info incl. country_name, region_name, last_updated etc.
33
- :errors, ## encoding errors etc.
34
- :sects
35
-
36
-
37
- def initialize( html_ascii )
38
- @html_ascii = html_ascii
39
-
40
- ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
- @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
42
-
43
- @html_debug = map_sects( @html )
44
- @html_debug = map_subsects( @html_debug )
45
-
46
- html_sects = split_sects( @html_debug )
47
- pp html_sects
48
-
49
-
50
- @sects = []
51
- html_sects.each do |html_sect|
52
- html_sect_head = html_sect[0]
53
- html_subsects = html_sect[1]
54
- puts html_sect_head
55
- puts html_subsects.size
56
-
57
- ## get section title
58
- ## @SECTION{Economy} => Economy
59
- if html_sect_head =~ /@SECTION{(.+?)}/
60
- title = $1.strip
61
- puts title
62
- sect = Sect.new
63
- sect.title = title
64
- ## get subsections
65
- subsects = []
66
- html_subsects.each do |html_subsect|
67
- html_subsect_head = html_subsect[0]
68
- html_subsect_body = html_subsect[1]
69
- if html_subsect_head =~ /@SUBSECTION{(.+?)}/
70
- title = $1.strip
71
- title = title.sub( /:\z/, '' ) # remove trailing : if present
72
- title = title.strip
73
-
74
- puts title
75
- subsect = Subsect.new
76
- subsect.title = title ## todo/fix: cut off trailing colon (:)
77
-
78
- b = Factbook::ItemBuilder.new( html_subsect_body, title )
79
- h = b.read
80
- subsect.data = h
81
-
82
- subsects << subsect
83
- else
84
- ## warn/fix: no subsection title found
85
- end
86
- end
87
- sect.subsects = subsects
88
- @sects << sect
89
- else
90
- ## warn/fix: no section title found
91
- end
92
- end
93
-
94
- self ## return self -- needed?? default (standard) anyway?? check and remove
95
- end
96
-
97
-
98
-
99
- def map_sects( html )
100
- ## convert section titles
101
- ## from <h2>..</h2>
102
- ## to "unified" marker
103
-
104
- ## e.g.
105
- ## <h2 sectiontitle='Introduction' ccode='au'>Introduction :: <span class='region'>AUSTRIA </span></h2>
106
- ## <h2>Introduction</h2>
107
-
108
- title_regex= /<h2
109
- (?:\s[^>]+)? ## allow optional attributes in h2
110
- >
111
- \s*
112
- ([^<>]+?) ## note: use non-greedy; do NOT allow tags inside for now
113
- \s*
114
- (?:\s::\s
115
- .+? ## note: use non-greedy; allows tags inside
116
- )? ## strip optional name (e.g. :: AUSTRIA)
117
- <\/h2>
118
- /xim
119
-
120
- html = html.gsub( title_regex ) do |m|
121
- puts "** found section >#{$1}<:"
122
- puts " >|#{m}|<"
123
-
124
- "\n\n@SECTION{#{$1}}\n\n"
125
- end
126
- html
127
- end
128
-
129
-
130
- def map_subsects( html )
131
- ## convert subsection titles
132
- ## from <div id='field'>..</div>
133
- ## to "unified" marker
134
-
135
- ## e.g.
136
- ## <div id='field' class='category'>Disputes - international:</div>
137
-
138
- title_regex= /<div \s id='field'
139
- \s class='category'>
140
- \s*
141
- (.+?) ## note: use non-greedy; allows tags inside - why? why not
142
- \s*
143
- <\/div>
144
- /xim
145
-
146
- html = html.gsub( title_regex ) do |m|
147
- puts "** found subsection >#{$1}<:"
148
- puts " >|#{m}|<"
149
-
150
- "\n@SUBSECTION{#{$1}}\n"
151
- end
152
- html
153
- end
154
-
155
-
156
-
157
- def split_sects( html )
158
- ####
159
- # split html in sections (divided by section headings)
160
- # e.g. remove optional prolog ??,
161
- ## [[heading,sect],
162
- ## [heading,sect],
163
- ## [heading,sect],...]
164
-
165
- ## note: "wrap" regex in a capture group (just one)
166
- ## String#split will include all capture groups in the result array
167
-
168
- section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
169
-
170
- chunks = html.split( section_regex )
171
-
172
- ## check if first item is a section or (html) prolog
173
- # if prolog (remove)
174
- chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
175
-
176
- pairs = chunks.each_slice(2).to_a
177
-
178
- ## now split subsections
179
- newpairs = []
180
- pairs.each do |item|
181
- ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
182
- newpairs << [item[0], split_subsects( item[1]) ]
183
- end
184
- newpairs
185
- end
186
-
187
-
188
- def split_subsects( html )
189
- ####
190
- # split html in subsections (divided by subsection headings)
191
- # e.g. remove optional prolog ??,
192
- ## [[heading,sect],
193
- ## [heading,sect],
194
- ## [heading,sect],...]
195
-
196
- ## note: "wrap" regex in a capture group (just one)
197
- ## String#split will include all capture groups in the result array
198
-
199
- subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
200
-
201
- chunks = html.split( subsection_regex )
202
-
203
- ## check if first item is a section or (html) prolog
204
- # if prolog (remove)
205
- chunks.slice!(0) unless chunks[0] =~ /@SUBSECTION/ ## starts w/ @SUBSECTION
206
-
207
- pairs = chunks.each_slice(2).to_a
208
- pairs
209
- end
210
-
211
- end # class Builder
212
-
213
-
214
- end # module Factbook
@@ -1,92 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
6
- include LogUtils::Logging
7
- include NormalizeHelper ## e.g. normalize_category
8
-
9
- def initialize( html, name )
10
- @html = html
11
- @name = name # add category/field name e.g. Area, Location, etc.
12
- end
13
-
14
- def read
15
- ## return hash from html snippet
16
- doc = Nokogiri::HTML.fragment( @html )
17
-
18
- data = {}
19
- last_node = nil ## track last hash (always use text key)
20
- last_node_data_count = 0
21
-
22
- ## note:
23
- ## skip whitespace text nodes (e.g. \n\n etc); just use divs
24
- doc.children.filter('div').each_with_index do |child,i|
25
-
26
- if child['class'] == 'category_data'
27
- text = child.text ## fix/todo: use strip
28
- puts "category_data: >#{text}<"
29
-
30
- if last_node.nil?
31
- ## assume it's the very first entry; use implied/auto-created category
32
- data['text'] = ''
33
- last_node = data
34
- last_node_data_count = 0
35
- end
36
-
37
- ### first category_data element?
38
- if last_node_data_count == 0
39
- if last_node['text'] == ''
40
- last_node['text'] = text
41
- else ### possible ??? if data_count is zero - not should not include any data
42
- ## todo: issue warning here - why? why not??
43
- last_node['text'] += " #{text}" ## append w/o separator
44
- end
45
- else
46
- if @name == 'Demographic profile' ## special case (use space a sep)
47
- last_node['text'] += " #{text}" ## append without (w/o) separator
48
- else
49
- last_node['text'] += " ++ #{text}" ## append with ++ separator
50
- end
51
- end
52
- last_node_data_count += 1
53
-
54
- elsif child['class'].nil? ## div without any class e.g. <div>..</div>
55
- ## assume category and category_data pair w/ spans
56
- spans = child.children.filter('span')
57
- if spans.size > 2
58
- puts "*** warn: expected two (or one) spans; got #{spans.inspect}"
59
- end
60
-
61
- ## pp spans
62
-
63
- span_key = spans[0] ## assume 1st entry is span.category
64
- span_value = spans[1] ## assume 2nd entry is span.category_data
65
-
66
- key = normalize_category( span_key.text )
67
-
68
- ## note: allow optional category_data for now
69
- value = span_value ? span_value.text : nil
70
-
71
- puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
72
-
73
- ## start new pair
74
- last_node = data[key] = { 'text' => value }
75
- last_node_data_count = value ? 1 : 0 ## note: set to 1 if value present
76
- else
77
- puts "*** warn: item builder -- unknow css class in #{child.inspect}"
78
- end
79
-
80
- ## pp child
81
- ## css = child['class']
82
- ## puts "[#{i}] #{child.name} class='>#{css}< : #{css.class.name}' >#{child.text}<"
83
- end
84
-
85
- pp data
86
- data
87
- end
88
-
89
-
90
- end # class ItemBuilder
91
-
92
- end # module Factbook