factbook 1.1.1 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +4 -4
  2. data/{HISTORY.md → CHANGELOG.md} +3 -3
  3. data/Manifest.txt +1 -58
  4. data/README.md +50 -575
  5. data/Rakefile +29 -33
  6. data/lib/factbook.rb +8 -75
  7. metadata +20 -114
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -214
  16. data/lib/factbook/builder_item.rb +0 -92
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -185
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -207
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -102
  34. data/lib/factbook/version.rb +0 -22
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -18
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/au.html +0 -2006
  48. data/test/data/src/be.html +0 -2011
  49. data/test/helper.rb +0 -11
  50. data/test/test_attribs.rb +0 -82
  51. data/test/test_attribs_def.rb +0 -20
  52. data/test/test_builder.rb +0 -35
  53. data/test/test_codes.rb +0 -76
  54. data/test/test_comparisons.rb +0 -19
  55. data/test/test_convert.rb +0 -30
  56. data/test/test_counter.rb +0 -31
  57. data/test/test_fields.rb +0 -52
  58. data/test/test_importer.rb +0 -55
  59. data/test/test_item_builder.rb +0 -99
  60. data/test/test_json.rb +0 -44
  61. data/test/test_json_builder.rb +0 -25
  62. data/test/test_normalize.rb +0 -23
  63. data/test/test_page.rb +0 -38
  64. data/test/test_sanitizer.rb +0 -35
@@ -1,75 +0,0 @@
1
- Num,Category,Name
2
- 2147,Geography,Area
3
- 2119,People and Society,Population
4
- 2002,People and Society,Population growth rate
5
- 2054,People and Society,Birth rate
6
- 2066,People and Society,Death rate
7
- 2112,People and Society,Net migration rate
8
- 2223,People and Society,Maternal mortality rate
9
- 2091,People and Society,Infant mortality rate
10
- 2102,People and Society,Life expectancy at birth
11
- 2127,People and Society,Total fertility rate
12
- 2225,People and Society,Health expenditures
13
- 2155,People and Society,HIV/AIDS - adult prevalence rate
14
- 2156,People and Society,HIV/AIDS - people living with HIV/AIDS
15
- 2157,People and Society,HIV/AIDS - deaths
16
- 2228,People and Society,Obesity - adult prevalence rate
17
- 2224,People and Society,Children under the age of 5 years underweight
18
- 2206,People and Society,Education expenditures
19
- 2229,People and Society,"Unemployment, youth ages 15-24"
20
- 2001,Economy,GDP (purchasing power parity)
21
- 2003,Economy,GDP - real growth rate
22
- 2004,Economy,GDP - per capita (PPP)
23
- 2260,Economy,Gross national saving
24
- 2089,Economy,Industrial production growth rate
25
- 2095,Economy,Labor force
26
- 2129,Economy,Unemployment rate
27
- 2172,Economy,Distribution of family income - Gini index
28
- 2221,Economy,Taxes and other revenues
29
- 2222,Economy,Budget surplus (+) or deficit (-)
30
- 2186,Economy,Public debt
31
- 2092,Economy,Inflation rate (consumer prices)
32
- 2207,Economy,Central bank discount rate
33
- 2208,Economy,Commercial bank prime lending rate
34
- 2214,Economy,Stock of narrow money
35
- 2215,Economy,Stock of broad money
36
- 2211,Economy,Stock of domestic credit
37
- 2200,Economy,Market value of publicly traded shares
38
- 2187,Economy,Current account balance
39
- 2078,Economy,Exports
40
- 2087,Economy,Imports
41
- 2188,Economy,Reserves of foreign exchange and gold
42
- 2079,Economy,Debt - external
43
- 2198,Economy,Stock of direct foreign investment - at home
44
- 2199,Economy,Stock of direct foreign investment - abroad
45
- 2232,Energy,Electricity - production
46
- 2233,Energy,Electricity - consumption
47
- 2234,Energy,Electricity - exports
48
- 2235,Energy,Electricity - imports
49
- 2236,Energy,Electricity - installed generating capacity
50
- 2237,Energy,Electricity - from fossil fuels
51
- 2239,Energy,Electricity - from nuclear fuels
52
- 2238,Energy,Electricity - from hydroelectric plants
53
- 2240,Energy,Electricity - from other renewable sources
54
- 2241,Energy,Crude oil - production
55
- 2242,Energy,Crude oil - exports
56
- 2243,Energy,Crude oil - imports
57
- 2244,Energy,Crude oil - proved reserves
58
- 2245,Energy,Refined petroleum products - production
59
- 2246,Energy,Refined petroleum products - consumption
60
- 2247,Energy,Refined petroleum products - exports
61
- 2248,Energy,Refined petroleum products - imports
62
- 2249,Energy,Natural gas - production
63
- 2250,Energy,Natural gas - consumption
64
- 2251,Energy,Natural gas - exports
65
- 2252,Energy,Natural gas - imports
66
- 2253,Energy,Natural gas - proved reserves
67
- 2150,Communications,Telephones - fixed lines
68
- 2151,Communications,Telephones - mobile cellular
69
- 2153,Communications,Internet users
70
- 2053,Transportation,Airports
71
- 2121,Transportation,Railways
72
- 2085,Transportation,Roadways
73
- 2093,Transportation,Waterways
74
- 2108,Transportation,Merchant marine
75
- 2034,Military,Military expenditures
@@ -1,72 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Almanac
7
-
8
- ## convenience helper ("factory")
9
- def self.from_json( codes, json_dir: '.' )
10
- pages = JsonPageReader.new( json_dir ).read_pages( codes )
11
- self.new( pages )
12
- end
13
-
14
-
15
- def initialize( pages )
16
- @pages = pages
17
- end
18
-
19
- def render( template )
20
- buf = ''
21
- @pages.each do |page|
22
- text = PageCtx.new( page, template ).render
23
-
24
- puts text ## for debugging write country profile to console (too)
25
- buf << text
26
- end
27
- puts "count: #{@pages.count}"
28
- buf ## return buffered almanac text
29
- end
30
-
31
-
32
- class PageCtx
33
- attr_accessor :page
34
-
35
- def initialize(page, template)
36
- @page = page
37
- @template = template
38
- end
39
-
40
- ##############################
41
- ## add some "view helpers"
42
-
43
- def name
44
- ## -- calculate name (use long name if (short) name is not availabe e.g. none)
45
- ## e.g. Austria
46
- if @name.nil?
47
- @name = page.name
48
- @name = page.name_long if @name == 'none'
49
- end
50
- @name
51
- end
52
-
53
- def names( separator: ' • ' )
54
- ## e.g. Austria • Österreich
55
- if @names.nil?
56
- if page.name_local.blank? || page.name_local == 'none' || page.name_local == name
57
- @names = [name] ## no local (in its own non-english language) name
58
- else
59
- @names = [name, page.name_local]
60
- end
61
- end
62
- @names.join( separator )
63
- end
64
-
65
- def render
66
- ERB.new( @template).result( binding )
67
- end
68
- end ## PageCtx
69
-
70
- end ## Almanac
71
-
72
- end # module Factbook
@@ -1,74 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module Factbook
5
-
6
- class Attributes
7
-
8
- Attribute = Struct.new( :name,
9
- :category, ## e.g. Introduction, Geography, etc.
10
- :path, ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
11
- )
12
-
13
- def self.from_yaml( path )
14
-
15
- h = YAML.load_file( path )
16
- pp h
17
-
18
- attribs = []
19
-
20
- ## note: use a copy (e.g. h.dup) for now (hash gets changed by build_attribs!!)
21
- new_h = h.dup
22
- new_h.each do |k,v|
23
- category = k
24
- build_attribs( attribs, category, [], v )
25
- end
26
-
27
- self.new( attribs )
28
- end
29
-
30
-
31
- def self.build_attribs( attribs, category, path, h )
32
-
33
- ## assume it's an attribute definition hash
34
- ## note: !! exclude special cases:
35
- ## Capital -- incl. name key itself
36
- ## National anthem
37
- if h.has_key?( 'name' ) && ['Capital','National anthem'].include?( path[-1] ) == false
38
- a = Attribute.new
39
- a.name = h['name']
40
- a.category = category
41
- a.path = path
42
-
43
- puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
44
- attribs << a
45
-
46
- ## note: make sure a modifable copy (of h) gets passed in
47
- h.delete( 'name' )
48
- end
49
-
50
- return if h.empty? ## empty hash; nothing (more) to do; return
51
-
52
- ## continue walking (recursive)
53
- h.each do |k,v|
54
- new_path = path.dup << k ## note: create a new array (copy)
55
- build_attribs( attribs, category, new_path, v )
56
- end
57
- end
58
-
59
-
60
- def initialize( attribs )
61
- @attribs = attribs
62
- end
63
-
64
- def to_a() @attribs; end
65
- def size() @attribs.size; end
66
-
67
- def each
68
- @attribs.each { |attrib| yield( attrib ) }
69
- end
70
-
71
- end # class Attributes
72
-
73
- end # module Factbook
74
-
@@ -1,214 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class Builder ## todo: change to PageBuilder ???
6
- include LogUtils::Logging
7
-
8
-
9
- =begin
10
- def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
11
- ## check/todo: rename input_dir to just dir or to include ?
12
- ## (there's no output_dir)?? - why? why not?
13
- input_dir = opts[:input_dir] || '.'
14
- self.from_file( "#{input_dir}/#{cc}.html" )
15
- end
16
- =end
17
-
18
-
19
- def self.from_file( path )
20
- html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
21
- self.from_string( html_ascii )
22
- end
23
-
24
- def self.from_string( html_ascii ) ## note: expects ASCII-7BIT/BINARY encoding
25
- self.new( html_ascii )
26
- end
27
-
28
-
29
- attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
30
- :html, ## utf-8 encoded profile
31
- :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
32
- :info, ## page info incl. country_name, region_name, last_updated etc.
33
- :errors, ## encoding erros etc.
34
- :sects
35
-
36
-
37
- def initialize( html_ascii )
38
- @html_ascii = html_ascii
39
-
40
- ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
- @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
42
-
43
- @html_debug = map_sects( @html )
44
- @html_debug = map_subsects( @html_debug )
45
-
46
- html_sects = split_sects( @html_debug )
47
- pp html_sects
48
-
49
-
50
- @sects = []
51
- html_sects.each do |html_sect|
52
- html_sect_head = html_sect[0]
53
- html_subsects = html_sect[1]
54
- puts html_sect_head
55
- puts html_subsects.size
56
-
57
- ## get section title
58
- ## @SECTION{Economy} => Economy
59
- if html_sect_head =~ /@SECTION{(.+?)}/
60
- title = $1.strip
61
- puts title
62
- sect = Sect.new
63
- sect.title = title
64
- ## get subsections
65
- subsects = []
66
- html_subsects.each do |html_subsect|
67
- html_subsect_head = html_subsect[0]
68
- html_subsect_body = html_subsect[1]
69
- if html_subsect_head =~ /@SUBSECTION{(.+?)}/
70
- title = $1.strip
71
- title = title.sub( /:\z/, '' ) # remove trailing : if present
72
- title = title.strip
73
-
74
- puts title
75
- subsect = Subsect.new
76
- subsect.title = title ## todo/fix: cut off trailing colon (:)
77
-
78
- b = Factbook::ItemBuilder.new( html_subsect_body, title )
79
- h = b.read
80
- subsect.data = h
81
-
82
- subsects << subsect
83
- else
84
- ## warn/fix: no subsection title found
85
- end
86
- end
87
- sect.subsects = subsects
88
- @sects << sect
89
- else
90
- ## warn/fix: no section title found
91
- end
92
- end
93
-
94
- self ## return self -- needed?? default (standard) anyway?? check and remove
95
- end
96
-
97
-
98
-
99
- def map_sects( html )
100
- ## convert section titles
101
- ## from <h2>..</h2>
102
- ## to "unified" marker
103
-
104
- ## e.g.
105
- ## <h2 sectiontitle='Introduction' ccode='au'>Introduction :: <span class='region'>AUSTRIA </span></h2>
106
- ## <h2>Introduction</h2>
107
-
108
- title_regex= /<h2
109
- (?:\s[^>]+)? ## allow optional attributes in h2
110
- >
111
- \s*
112
- ([^<>]+?) ## note: use non-greedy; do NOT allow tags inside for now
113
- \s*
114
- (?:\s::\s
115
- .+? ## note: use non-greedy; allows tags inside
116
- )? ## strip optional name (e.g. :: AUSTRIA)
117
- <\/h2>
118
- /xim
119
-
120
- html = html.gsub( title_regex ) do |m|
121
- puts "** found section >#{$1}<:"
122
- puts " >|#{m}|<"
123
-
124
- "\n\n@SECTION{#{$1}}\n\n"
125
- end
126
- html
127
- end
128
-
129
-
130
- def map_subsects( html )
131
- ## convert subsection titles
132
- ## from <div id='field'>..</div>
133
- ## to "unified" marker
134
-
135
- ## e.g.
136
- ## <div id='field' class='category'>Disputes - international:</div>
137
-
138
- title_regex= /<div \s id='field'
139
- \s class='category'>
140
- \s*
141
- (.+?) ## note: use non-greedy; allows tags inside - why? why not
142
- \s*
143
- <\/div>
144
- /xim
145
-
146
- html = html.gsub( title_regex ) do |m|
147
- puts "** found subsection >#{$1}<:"
148
- puts " >|#{m}|<"
149
-
150
- "\n@SUBSECTION{#{$1}}\n"
151
- end
152
- html
153
- end
154
-
155
-
156
-
157
- def split_sects( html )
158
- ####
159
- # split html in sections (divided by section headings)
160
- # e.g. remove optional prolog ??,
161
- ## [[heading,sect],
162
- ## [heading,sect],
163
- ## [heading,sect],...]
164
-
165
- ## note: "wrap" regex in a capture group (just one)
166
- ## String#split will include all catpure groups in the result array
167
-
168
- section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
169
-
170
- chunks = html.split( section_regex )
171
-
172
- ## check if first item is a section or (html) prolog
173
- # if prolog (remove)
174
- chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
175
-
176
- pairs = chunks.each_slice(2).to_a
177
-
178
- ## now split subsections
179
- newpairs = []
180
- pairs.each do |item|
181
- ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
182
- newpairs << [item[0], split_subsects( item[1]) ]
183
- end
184
- newpairs
185
- end
186
-
187
-
188
- def split_subsects( html )
189
- ####
190
- # split html in subsections (divided by subsection headings)
191
- # e.g. remove optional prolog ??,
192
- ## [[heading,sect],
193
- ## [heading,sect],
194
- ## [heading,sect],...]
195
-
196
- ## note: "wrap" regex in a capture group (just one)
197
- ## String#split will include all catpure groups in the result array
198
-
199
- subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
200
-
201
- chunks = html.split( subsection_regex )
202
-
203
- ## check if first item is a section or (html) prolog
204
- # if prolog (remove)
205
- chunks.slice!(0) unless chunks[0] =~ /@SUBSECTION/ ## starts w/ @SUBSECTION
206
-
207
- pairs = chunks.each_slice(2).to_a
208
- pairs
209
- end
210
-
211
- end # class Builder
212
-
213
-
214
- end # module Factbook
@@ -1,92 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
6
- include LogUtils::Logging
7
- include NormalizeHelper ## e.g. normalize_category
8
-
9
- def initialize( html, name )
10
- @html = html
11
- @name = name # add category/field name e.g. Area, Location, etc.
12
- end
13
-
14
- def read
15
- ## return hash from html snippet
16
- doc = Nokogiri::HTML.fragment( @html )
17
-
18
- data = {}
19
- last_node = nil ## track last hash (always use text key)
20
- last_node_data_count = 0
21
-
22
- ## note:
23
- ## skip whitespace text nodes (e.g. \n\n etc); just use divs
24
- doc.children.filter('div').each_with_index do |child,i|
25
-
26
- if child['class'] == 'category_data'
27
- text = child.text ## fix/todo: use strip
28
- puts "category_data: >#{text}<"
29
-
30
- if last_node.nil?
31
- ## assume its the very first entry; use implied/auto-created category
32
- data['text'] = ''
33
- last_node = data
34
- last_node_data_count = 0
35
- end
36
-
37
- ### first category_data element?
38
- if last_node_data_count == 0
39
- if last_node['text'] == ''
40
- last_node['text'] = text
41
- else ### possible ??? if data_count is zero - not should not include any data
42
- ## todo: issue warning here - why? why not??
43
- last_node['text'] += " #{text}" ## append w/o separator
44
- end
45
- else
46
- if @name == 'Demographic profile' ## special case (use space a sep)
47
- last_node['text'] += " #{text}" ## append without (w/o) separator
48
- else
49
- last_node['text'] += " ++ #{text}" ## append with ++ separator
50
- end
51
- end
52
- last_node_data_count += 1
53
-
54
- elsif child['class'].nil? ## div without any class e.g. <div>..</div>
55
- ## assume category and category_data pair w/ spans
56
- spans = child.children.filter('span')
57
- if spans.size > 2
58
- puts "*** warn: expected two (or one) spans; got #{spans.inspect}"
59
- end
60
-
61
- ## pp spans
62
-
63
- span_key = spans[0] ## assume 1st entry is span.category
64
- span_value = spans[1] ## assume 2nd entry is span.category_data
65
-
66
- key = normalize_category( span_key.text )
67
-
68
- ## note: allow optional category_data for now
69
- value = span_value ? span_value.text : nil
70
-
71
- puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
72
-
73
- ## start new pair
74
- last_node = data[key] = { 'text' => value }
75
- last_node_data_count = value ? 1 : 0 ## note: set to 1 if value present
76
- else
77
- puts "*** warn: item builder -- unknow css class in #{child.inspect}"
78
- end
79
-
80
- ## pp child
81
- ## css = child['class']
82
- ## puts "[#{i}] #{child.name} class='>#{css}< : #{css.class.name}' >#{child.text}<"
83
- end
84
-
85
- pp data
86
- data
87
- end
88
-
89
-
90
- end # class ItemBuilder
91
-
92
- end # module Factbook