factbook 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -1
  3. data/Manifest.txt +0 -61
  4. data/README.md +8 -506
  5. data/Rakefile +4 -9
  6. data/lib/factbook.rb +4 -64
  7. metadata +6 -124
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -212
  16. data/lib/factbook/builder_item.rb +0 -126
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -148
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -178
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -129
  34. data/lib/factbook/version.rb +0 -21
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -19
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/ag.html +0 -716
  48. data/test/data/src/au-2015-09-24.html +0 -2006
  49. data/test/data/src/au.html +0 -658
  50. data/test/data/src/be-2015-09-24.html +0 -2011
  51. data/test/data/src/be.html +0 -648
  52. data/test/helper.rb +0 -11
  53. data/test/test_attribs.rb +0 -87
  54. data/test/test_attribs_def.rb +0 -20
  55. data/test/test_builder.rb +0 -35
  56. data/test/test_codes.rb +0 -76
  57. data/test/test_comparisons.rb +0 -19
  58. data/test/test_convert.rb +0 -30
  59. data/test/test_counter.rb +0 -31
  60. data/test/test_fields.rb +0 -52
  61. data/test/test_importer.rb +0 -56
  62. data/test/test_item_builder.rb +0 -99
  63. data/test/test_json.rb +0 -45
  64. data/test/test_json_builder.rb +0 -25
  65. data/test/test_normalize.rb +0 -23
  66. data/test/test_page.rb +0 -38
  67. data/test/test_sanitizer.rb +0 -39
  68. data/test/test_sanitizer_regex.rb +0 -89
@@ -1,75 +0,0 @@
1
- Num,Category,Name
2
- 2147,Geography,Area
3
- 2119,People and Society,Population
4
- 2002,People and Society,Population growth rate
5
- 2054,People and Society,Birth rate
6
- 2066,People and Society,Death rate
7
- 2112,People and Society,Net migration rate
8
- 2223,People and Society,Maternal mortality rate
9
- 2091,People and Society,Infant mortality rate
10
- 2102,People and Society,Life expectancy at birth
11
- 2127,People and Society,Total fertility rate
12
- 2225,People and Society,Health expenditures
13
- 2155,People and Society,HIV/AIDS - adult prevalence rate
14
- 2156,People and Society,HIV/AIDS - people living with HIV/AIDS
15
- 2157,People and Society,HIV/AIDS - deaths
16
- 2228,People and Society,Obesity - adult prevalence rate
17
- 2224,People and Society,Children under the age of 5 years underweight
18
- 2206,People and Society,Education expenditures
19
- 2229,People and Society,"Unemployment, youth ages 15-24"
20
- 2001,Economy,GDP (purchasing power parity)
21
- 2003,Economy,GDP - real growth rate
22
- 2004,Economy,GDP - per capita (PPP)
23
- 2260,Economy,Gross national saving
24
- 2089,Economy,Industrial production growth rate
25
- 2095,Economy,Labor force
26
- 2129,Economy,Unemployment rate
27
- 2172,Economy,Distribution of family income - Gini index
28
- 2221,Economy,Taxes and other revenues
29
- 2222,Economy,Budget surplus (+) or deficit (-)
30
- 2186,Economy,Public debt
31
- 2092,Economy,Inflation rate (consumer prices)
32
- 2207,Economy,Central bank discount rate
33
- 2208,Economy,Commercial bank prime lending rate
34
- 2214,Economy,Stock of narrow money
35
- 2215,Economy,Stock of broad money
36
- 2211,Economy,Stock of domestic credit
37
- 2200,Economy,Market value of publicly traded shares
38
- 2187,Economy,Current account balance
39
- 2078,Economy,Exports
40
- 2087,Economy,Imports
41
- 2188,Economy,Reserves of foreign exchange and gold
42
- 2079,Economy,Debt - external
43
- 2198,Economy,Stock of direct foreign investment - at home
44
- 2199,Economy,Stock of direct foreign investment - abroad
45
- 2232,Energy,Electricity - production
46
- 2233,Energy,Electricity - consumption
47
- 2234,Energy,Electricity - exports
48
- 2235,Energy,Electricity - imports
49
- 2236,Energy,Electricity - installed generating capacity
50
- 2237,Energy,Electricity - from fossil fuels
51
- 2239,Energy,Electricity - from nuclear fuels
52
- 2238,Energy,Electricity - from hydroelectric plants
53
- 2240,Energy,Electricity - from other renewable sources
54
- 2241,Energy,Crude oil - production
55
- 2242,Energy,Crude oil - exports
56
- 2243,Energy,Crude oil - imports
57
- 2244,Energy,Crude oil - proved reserves
58
- 2245,Energy,Refined petroleum products - production
59
- 2246,Energy,Refined petroleum products - consumption
60
- 2247,Energy,Refined petroleum products - exports
61
- 2248,Energy,Refined petroleum products - imports
62
- 2249,Energy,Natural gas - production
63
- 2250,Energy,Natural gas - consumption
64
- 2251,Energy,Natural gas - exports
65
- 2252,Energy,Natural gas - imports
66
- 2253,Energy,Natural gas - proved reserves
67
- 2150,Communications,Telephones - fixed lines
68
- 2151,Communications,Telephones - mobile cellular
69
- 2153,Communications,Internet users
70
- 2053,Transportation,Airports
71
- 2121,Transportation,Railways
72
- 2085,Transportation,Roadways
73
- 2093,Transportation,Waterways
74
- 2108,Transportation,Merchant marine
75
- 2034,Military,Military expenditures
@@ -1,72 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Almanac
7
-
8
- ## convenience helper ("factory")
9
- def self.from_json( codes, json_dir: '.' )
10
- pages = JsonPageReader.new( json_dir ).read_pages( codes )
11
- self.new( pages )
12
- end
13
-
14
-
15
- def initialize( pages )
16
- @pages = pages
17
- end
18
-
19
- def render( template )
20
- buf = ''
21
- @pages.each do |page|
22
- text = PageCtx.new( page, template ).render
23
-
24
- puts text ## for debugging write country profile to console (too)
25
- buf << text
26
- end
27
- puts "count: #{@pages.count}"
28
- buf ## return buffered almanac text
29
- end
30
-
31
-
32
- class PageCtx
33
- attr_accessor :page
34
-
35
- def initialize(page, template)
36
- @page = page
37
- @template = template
38
- end
39
-
40
- ##############################
41
- ## add some "view helpers"
42
-
43
- def name
44
- ## -- calculate name (use long name if (short) name is not availabe e.g. none)
45
- ## e.g. Austria
46
- if @name.nil?
47
- @name = page.name
48
- @name = page.name_long if @name == 'none'
49
- end
50
- @name
51
- end
52
-
53
- def names( separator: ' • ' )
54
- ## e.g. Austria • Österreich
55
- if @names.nil?
56
- if page.name_local.blank? || page.name_local == 'none' || page.name_local == name
57
- @names = [name] ## no local (in its own non-english language) name
58
- else
59
- @names = [name, page.name_local]
60
- end
61
- end
62
- @names.join( separator )
63
- end
64
-
65
- def render
66
- ERB.new( @template).result( binding )
67
- end
68
- end ## PageCtx
69
-
70
- end ## Almanac
71
-
72
- end # module Factbook
@@ -1,74 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module Factbook
5
-
6
- class Attributes
7
-
8
- Attribute = Struct.new( :name,
9
- :category, ## e.g. Introduction, Geography, etc.
10
- :path, ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
11
- )
12
-
13
- def self.from_yaml( path )
14
-
15
- h = YAML.load_file( path )
16
- pp h
17
-
18
- attribs = []
19
-
20
- ## note: use a copy (e.g. h.dup) for now (hash gets changed by build_attribs!!)
21
- new_h = h.dup
22
- new_h.each do |k,v|
23
- category = k
24
- build_attribs( attribs, category, [], v )
25
- end
26
-
27
- self.new( attribs )
28
- end
29
-
30
-
31
- def self.build_attribs( attribs, category, path, h )
32
-
33
- ## assume it's an attribute definition hash
34
- ## note: !! exclude special cases:
35
- ## Capital -- incl. name key itself
36
- ## National anthem
37
- if h.has_key?( 'name' ) && ['Capital','National anthem'].include?( path[-1] ) == false
38
- a = Attribute.new
39
- a.name = h['name']
40
- a.category = category
41
- a.path = path
42
-
43
- puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
44
- attribs << a
45
-
46
- ## note: make sure a modifable copy (of h) gets passed in
47
- h.delete( 'name' )
48
- end
49
-
50
- return if h.empty? ## empty hash; nothing (more) to do; return
51
-
52
- ## continue walking (recursive)
53
- h.each do |k,v|
54
- new_path = path.dup << k ## note: create a new array (copy)
55
- build_attribs( attribs, category, new_path, v )
56
- end
57
- end
58
-
59
-
60
- def initialize( attribs )
61
- @attribs = attribs
62
- end
63
-
64
- def to_a() @attribs; end
65
- def size() @attribs.size; end
66
-
67
- def each
68
- @attribs.each { |attrib| yield( attrib ) }
69
- end
70
-
71
- end # class Attributes
72
-
73
- end # module Factbook
74
-
@@ -1,212 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class Builder ## todo: change to PageBuilder ???
6
- include LogUtils::Logging
7
-
8
-
9
- =begin
10
- def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
11
- ## check/todo: rename input_dir to just dir or to include ?
12
- ## (there's no output_dir)?? - why? why not?
13
- input_dir = opts[:input_dir] || '.'
14
- self.from_file( "#{input_dir}/#{cc}.html" )
15
- end
16
- =end
17
-
18
-
19
- def self.from_file( path )
20
- html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
21
- self.from_string( html_ascii )
22
- end
23
-
24
- def self.from_string( html_ascii ) ## note: expects ASCII-7BIT/BINARY encoding
25
- self.new( html_ascii )
26
- end
27
-
28
-
29
- attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
30
- :html, ## utf-8 encoded profile
31
- :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
32
- :info, ## page info incl. country_name, region_name, last_updated etc.
33
- :errors, ## encoding erros etc.
34
- :sects
35
-
36
-
37
- def initialize( html_ascii )
38
- @html_ascii = html_ascii
39
-
40
- ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
- @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
42
-
43
-
44
- html_sects = if @html.empty?
45
- ## note: support "empty" pages - old format waiting for update!!!
46
- ## cannot parse for now
47
- [] ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
48
- else
49
- @html_debug = map_sects( @html )
50
- @html_debug = map_subsects( @html_debug )
51
-
52
- split_sects( @html_debug )
53
- end
54
-
55
- pp html_sects
56
-
57
- ## debug
58
- ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
59
-
60
-
61
- @sects = []
62
- html_sects.each do |html_sect|
63
- html_sect_head = html_sect[0]
64
- html_subsects = html_sect[1]
65
- puts html_sect_head
66
- puts html_subsects.size
67
-
68
- ## get section title
69
- ## @SECTION{Economy} => Economy
70
- if html_sect_head =~ /@SECTION{(.+?)}/
71
- title = $1.strip
72
- puts title
73
- sect = Sect.new
74
- sect.title = title
75
- ## get subsections
76
- subsects = []
77
- html_subsects.each do |html_subsect|
78
- html_subsect_head = html_subsect[0]
79
- html_subsect_body = html_subsect[1]
80
- if html_subsect_head =~ /@SUBSECTION{(.+?)}/
81
- title = $1.strip
82
- title = title.sub( /:\z/, '' ) # remove trailing : if present
83
- title = title.strip
84
-
85
- puts title
86
- subsect = Subsect.new
87
- subsect.title = title ## todo/fix: cut off trailing colon (:)
88
-
89
- b = Factbook::ItemBuilder.new( html_subsect_body, title )
90
- h = b.read
91
- subsect.data = h
92
-
93
- subsects << subsect
94
- else
95
- ## warn/fix: no subsection title found
96
- end
97
- end
98
- sect.subsects = subsects
99
- @sects << sect
100
- else
101
- ## warn/fix: no section title found
102
- end
103
- end
104
-
105
- self ## return self -- needed?? default (standard) anyway?? check and remove
106
- end
107
-
108
-
109
-
110
- def map_sects( html )
111
- ## convert section titles to "unified" marker
112
- ## e.g.
113
- ## <h2>Introduction</h2>
114
-
115
- title_regex= /<h2>
116
- \s*
117
- (.+?) ## note: use non-greedy; do NOT allow tags inside for now
118
- \s*
119
- <\/h2>
120
- /xim
121
-
122
- html = html.gsub( title_regex ) do |m|
123
- puts "** found section >#{$1}<:"
124
- puts " >|#{m}|<"
125
-
126
- "\n\n@SECTION{#{$1}}\n\n"
127
- end
128
- html
129
- end
130
-
131
-
132
- def map_subsects( html )
133
- ## convert subsection titles to "unified" marker
134
- ## e.g.
135
- ## <h3>Disputes - international:</h3>
136
-
137
- title_regex= /<h3>
138
- \s*
139
- (.+?) ## note: use non-greedy; allows tags inside - why? why not
140
- \s*
141
- <\/h3>
142
- /xim
143
-
144
- html = html.gsub( title_regex ) do |m|
145
- puts "** found subsection >#{$1}<:"
146
- puts " >|#{m}|<"
147
-
148
- "\n@SUBSECTION{#{$1}}\n"
149
- end
150
- html
151
- end
152
-
153
-
154
-
155
- def split_sects( html )
156
- ####
157
- # split html in sections (divided by section headings)
158
- # e.g. remove optional prolog ??,
159
- ## [[heading,sect],
160
- ## [heading,sect],
161
- ## [heading,sect],...]
162
-
163
- ## note: "wrap" regex in a capture group (just one)
164
- ## String#split will include all catpure groups in the result array
165
-
166
- section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
167
-
168
- chunks = html.split( section_regex )
169
-
170
- ## check if first item is a section or (html) prolog
171
- # if prolog (remove)
172
- chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
173
-
174
- pairs = chunks.each_slice(2).to_a
175
-
176
- ## now split subsections
177
- newpairs = []
178
- pairs.each do |item|
179
- ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
180
- newpairs << [item[0], split_subsects( item[1]) ]
181
- end
182
- newpairs
183
- end
184
-
185
-
186
- def split_subsects( html )
187
- ####
188
- # split html in subsections (divided by subsection headings)
189
- # e.g. remove optional prolog ??,
190
- ## [[heading,sect],
191
- ## [heading,sect],
192
- ## [heading,sect],...]
193
-
194
- ## note: "wrap" regex in a capture group (just one)
195
- ## String#split will include all catpure groups in the result array
196
-
197
- subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
198
-
199
- chunks = html.split( subsection_regex )
200
-
201
- ## check if first item is a section or (html) prolog
202
- # if prolog (remove)
203
- chunks.slice!(0) unless chunks[0] =~ /@SUBSECTION/ ## starts w/ @SUBSECTION
204
-
205
- pairs = chunks.each_slice(2).to_a
206
- pairs
207
- end
208
-
209
- end # class Builder
210
-
211
-
212
- end # module Factbook
@@ -1,126 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
6
- include LogUtils::Logging
7
- include NormalizeHelper ## e.g. normalize_category
8
-
9
- def initialize( html, name )
10
- @html = html
11
- @name = name # add category/field name e.g. Area, Location, etc.
12
- end
13
-
14
-
15
-
16
- ##
17
- ## <div class="category_data subfield text">
18
- ## Portuguese (official and most widely spoken language)
19
- ##
20
- ## </div>
21
- ## <div class="category_data note">
22
- ## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
23
- ## </div>
24
-
25
-
26
- def read
27
- ## return hash from html snippet
28
- doc = Nokogiri::HTML.fragment( @html )
29
-
30
- data = {}
31
-
32
- ## note:
33
- ## skip whitespace text nodes (e.g. \n\n etc); just use divs
34
- doc_children = doc.children.filter('div')
35
-
36
- puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
37
-
38
- doc_children.each_with_index do |div,i|
39
- if div['class'].index( 'note' )
40
- text = squish( div.text.strip )
41
- puts "category_data: >#{text}<"
42
-
43
- data['note'] = { 'text' => text }
44
- elsif div['class'].index( 'historic' )
45
- ## add all historic together into one for now
46
- text = squish( div.text.strip )
47
- puts "category_data: >#{text}<"
48
-
49
- if i == 0
50
- data['text'] = text
51
- else
52
- ## append with / for now
53
- data['text'] += " / #{text}"
54
- end
55
- elsif div.css( 'span.subfield-name').empty?
56
- ## assume "implied text field"
57
- ## check for index == 1 / child count == 1 - why? why not
58
- text = squish( div.text.strip ) ## fix/todo: use strip
59
- puts "category_data: >#{text}<"
60
-
61
- data['text'] = text
62
-
63
- ## must be always first node for now
64
- if i != 0
65
- puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
66
- puts @html
67
- exit 1
68
- end
69
- elsif div['class'].index( 'grouped_subfield' )
70
- ## split grouped subfield!!
71
- ## <span class="subfield-name">arable land:</span>
72
- ## <span class="subfield-number">8.6%</span>
73
- ## <span class="subfield-date">(2011 est.)</span>
74
- ## /
75
- ## <span class="subfield-name">permanent crops:</span>
76
- ## <span class="subfield-number">0.8%</span>
77
- ## <span class="subfield-date">(2011 est.)</span>
78
- ## /
79
- ## <span class="subfield-name">permanent pasture:</span>
80
- ## <span class="subfield-number">23.5%</span>
81
- ## <span class="subfield-date">(2011 est.)</span>
82
-
83
- ## join names for now - why? why not?
84
- ## e.g. becomes:
85
- ## arable land / permanent crops / permanent pasture: for key ??
86
- span_names = div.css( 'span.subfield-name')
87
- keys = []
88
- span_names.each do |span|
89
- keys << normalize_category( span.text.strip )
90
- span.replace( '' )
91
- end
92
- key = keys.join( ' / ')
93
- text = squish( div.text.strip )
94
- puts "category_data key >#{key}<: >#{text}<"
95
- data[ key ] = { 'text' => text }
96
- else
97
- ## get subfield name
98
- span_names = div.css( 'span.subfield-name')
99
- if span_names.size > 1
100
- puts "!! ERROR - found more than one subfield-name:"
101
- puts div.to_html
102
- exit 1
103
- end
104
- key = normalize_category( span_names[0].text.strip )
105
- span_names[0].replace( '' )
106
-
107
- text = squish( div.text.strip )
108
- puts "category_data key >#{key}<: >#{text}<"
109
- data[ key ] = { 'text' => text }
110
- end
111
- end
112
-
113
-
114
- pp data
115
- data
116
- end
117
-
118
-
119
-
120
- def squish( str )
121
- str.gsub( /[ \t\n\r]{2,}/, ' ') ## replace multi-spaces (incl. newlines with once space)
122
- end
123
-
124
- end # class ItemBuilder
125
-
126
- end # module Factbook