factbook 2.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -1
  3. data/Manifest.txt +0 -61
  4. data/README.md +8 -506
  5. data/Rakefile +4 -9
  6. data/lib/factbook.rb +4 -64
  7. metadata +6 -124
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -212
  16. data/lib/factbook/builder_item.rb +0 -126
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -148
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -178
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -129
  34. data/lib/factbook/version.rb +0 -21
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -19
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/ag.html +0 -716
  48. data/test/data/src/au-2015-09-24.html +0 -2006
  49. data/test/data/src/au.html +0 -658
  50. data/test/data/src/be-2015-09-24.html +0 -2011
  51. data/test/data/src/be.html +0 -648
  52. data/test/helper.rb +0 -11
  53. data/test/test_attribs.rb +0 -87
  54. data/test/test_attribs_def.rb +0 -20
  55. data/test/test_builder.rb +0 -35
  56. data/test/test_codes.rb +0 -76
  57. data/test/test_comparisons.rb +0 -19
  58. data/test/test_convert.rb +0 -30
  59. data/test/test_counter.rb +0 -31
  60. data/test/test_fields.rb +0 -52
  61. data/test/test_importer.rb +0 -56
  62. data/test/test_item_builder.rb +0 -99
  63. data/test/test_json.rb +0 -45
  64. data/test/test_json_builder.rb +0 -25
  65. data/test/test_normalize.rb +0 -23
  66. data/test/test_page.rb +0 -38
  67. data/test/test_sanitizer.rb +0 -39
  68. data/test/test_sanitizer_regex.rb +0 -89
@@ -1,75 +0,0 @@
1
- Num,Category,Name
2
- 2147,Geography,Area
3
- 2119,People and Society,Population
4
- 2002,People and Society,Population growth rate
5
- 2054,People and Society,Birth rate
6
- 2066,People and Society,Death rate
7
- 2112,People and Society,Net migration rate
8
- 2223,People and Society,Maternal mortality rate
9
- 2091,People and Society,Infant mortality rate
10
- 2102,People and Society,Life expectancy at birth
11
- 2127,People and Society,Total fertility rate
12
- 2225,People and Society,Health expenditures
13
- 2155,People and Society,HIV/AIDS - adult prevalence rate
14
- 2156,People and Society,HIV/AIDS - people living with HIV/AIDS
15
- 2157,People and Society,HIV/AIDS - deaths
16
- 2228,People and Society,Obesity - adult prevalence rate
17
- 2224,People and Society,Children under the age of 5 years underweight
18
- 2206,People and Society,Education expenditures
19
- 2229,People and Society,"Unemployment, youth ages 15-24"
20
- 2001,Economy,GDP (purchasing power parity)
21
- 2003,Economy,GDP - real growth rate
22
- 2004,Economy,GDP - per capita (PPP)
23
- 2260,Economy,Gross national saving
24
- 2089,Economy,Industrial production growth rate
25
- 2095,Economy,Labor force
26
- 2129,Economy,Unemployment rate
27
- 2172,Economy,Distribution of family income - Gini index
28
- 2221,Economy,Taxes and other revenues
29
- 2222,Economy,Budget surplus (+) or deficit (-)
30
- 2186,Economy,Public debt
31
- 2092,Economy,Inflation rate (consumer prices)
32
- 2207,Economy,Central bank discount rate
33
- 2208,Economy,Commercial bank prime lending rate
34
- 2214,Economy,Stock of narrow money
35
- 2215,Economy,Stock of broad money
36
- 2211,Economy,Stock of domestic credit
37
- 2200,Economy,Market value of publicly traded shares
38
- 2187,Economy,Current account balance
39
- 2078,Economy,Exports
40
- 2087,Economy,Imports
41
- 2188,Economy,Reserves of foreign exchange and gold
42
- 2079,Economy,Debt - external
43
- 2198,Economy,Stock of direct foreign investment - at home
44
- 2199,Economy,Stock of direct foreign investment - abroad
45
- 2232,Energy,Electricity - production
46
- 2233,Energy,Electricity - consumption
47
- 2234,Energy,Electricity - exports
48
- 2235,Energy,Electricity - imports
49
- 2236,Energy,Electricity - installed generating capacity
50
- 2237,Energy,Electricity - from fossil fuels
51
- 2239,Energy,Electricity - from nuclear fuels
52
- 2238,Energy,Electricity - from hydroelectric plants
53
- 2240,Energy,Electricity - from other renewable sources
54
- 2241,Energy,Crude oil - production
55
- 2242,Energy,Crude oil - exports
56
- 2243,Energy,Crude oil - imports
57
- 2244,Energy,Crude oil - proved reserves
58
- 2245,Energy,Refined petroleum products - production
59
- 2246,Energy,Refined petroleum products - consumption
60
- 2247,Energy,Refined petroleum products - exports
61
- 2248,Energy,Refined petroleum products - imports
62
- 2249,Energy,Natural gas - production
63
- 2250,Energy,Natural gas - consumption
64
- 2251,Energy,Natural gas - exports
65
- 2252,Energy,Natural gas - imports
66
- 2253,Energy,Natural gas - proved reserves
67
- 2150,Communications,Telephones - fixed lines
68
- 2151,Communications,Telephones - mobile cellular
69
- 2153,Communications,Internet users
70
- 2053,Transportation,Airports
71
- 2121,Transportation,Railways
72
- 2085,Transportation,Roadways
73
- 2093,Transportation,Waterways
74
- 2108,Transportation,Merchant marine
75
- 2034,Military,Military expenditures
@@ -1,72 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Almanac
7
-
8
- ## convenience helper ("factory")
9
- def self.from_json( codes, json_dir: '.' )
10
- pages = JsonPageReader.new( json_dir ).read_pages( codes )
11
- self.new( pages )
12
- end
13
-
14
-
15
- def initialize( pages )
16
- @pages = pages
17
- end
18
-
19
- def render( template )
20
- buf = ''
21
- @pages.each do |page|
22
- text = PageCtx.new( page, template ).render
23
-
24
- puts text ## for debugging write country profile to console (too)
25
- buf << text
26
- end
27
- puts "count: #{@pages.count}"
28
- buf ## return buffered almanac text
29
- end
30
-
31
-
32
- class PageCtx
33
- attr_accessor :page
34
-
35
- def initialize(page, template)
36
- @page = page
37
- @template = template
38
- end
39
-
40
- ##############################
41
- ## add some "view helpers"
42
-
43
- def name
44
- ## -- calculate name (use long name if (short) name is not available e.g. none)
45
- ## e.g. Austria
46
- if @name.nil?
47
- @name = page.name
48
- @name = page.name_long if @name == 'none'
49
- end
50
- @name
51
- end
52
-
53
- def names( separator: ' • ' )
54
- ## e.g. Austria • Österreich
55
- if @names.nil?
56
- if page.name_local.blank? || page.name_local == 'none' || page.name_local == name
57
- @names = [name] ## no local (in its own non-english language) name
58
- else
59
- @names = [name, page.name_local]
60
- end
61
- end
62
- @names.join( separator )
63
- end
64
-
65
- def render
66
- ERB.new( @template).result( binding )
67
- end
68
- end ## PageCtx
69
-
70
- end ## Almanac
71
-
72
- end # module Factbook
@@ -1,74 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module Factbook
5
-
6
- class Attributes
7
-
8
- Attribute = Struct.new( :name,
9
- :category, ## e.g. Introduction, Geography, etc.
10
- :path, ## note: is an array e.g. ["Area - comparative"] or ["Area", "land"] etc.
11
- )
12
-
13
- def self.from_yaml( path )
14
-
15
- h = YAML.load_file( path )
16
- pp h
17
-
18
- attribs = []
19
-
20
- ## note: use a copy (e.g. h.dup) for now (hash gets changed by build_attribs!!)
21
- new_h = h.dup
22
- new_h.each do |k,v|
23
- category = k
24
- build_attribs( attribs, category, [], v )
25
- end
26
-
27
- self.new( attribs )
28
- end
29
-
30
-
31
- def self.build_attribs( attribs, category, path, h )
32
-
33
- ## assume it's an attribute definition hash
34
- ## note: !! exclude special cases:
35
- ## Capital -- incl. name key itself
36
- ## National anthem
37
- if h.has_key?( 'name' ) && ['Capital','National anthem'].include?( path[-1] ) == false
38
- a = Attribute.new
39
- a.name = h['name']
40
- a.category = category
41
- a.path = path
42
-
43
- puts " adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
44
- attribs << a
45
-
46
- ## note: make sure a modifiable copy (of h) gets passed in
47
- h.delete( 'name' )
48
- end
49
-
50
- return if h.empty? ## empty hash; nothing (more) to do; return
51
-
52
- ## continue walking (recursive)
53
- h.each do |k,v|
54
- new_path = path.dup << k ## note: create a new array (copy)
55
- build_attribs( attribs, category, new_path, v )
56
- end
57
- end
58
-
59
-
60
- def initialize( attribs )
61
- @attribs = attribs
62
- end
63
-
64
- def to_a() @attribs; end
65
- def size() @attribs.size; end
66
-
67
- def each
68
- @attribs.each { |attrib| yield( attrib ) }
69
- end
70
-
71
- end # class Attributes
72
-
73
- end # module Factbook
74
-
@@ -1,212 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class Builder ## todo: change to PageBuilder ???
6
- include LogUtils::Logging
7
-
8
-
9
- =begin
10
- def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
11
- ## check/todo: rename input_dir to just dir or to include ?
12
- ## (there's no output_dir)?? - why? why not?
13
- input_dir = opts[:input_dir] || '.'
14
- self.from_file( "#{input_dir}/#{cc}.html" )
15
- end
16
- =end
17
-
18
-
19
- def self.from_file( path )
20
- html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
21
- self.from_string( html_ascii )
22
- end
23
-
24
- def self.from_string( html_ascii ) ## note: expects ASCII-7BIT/BINARY encoding
25
- self.new( html_ascii )
26
- end
27
-
28
-
29
- attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
30
- :html, ## utf-8 encoded profile
31
- :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
32
- :info, ## page info incl. country_name, region_name, last_updated etc.
33
- :errors, ## encoding errors etc.
34
- :sects
35
-
36
-
37
- def initialize( html_ascii )
38
- @html_ascii = html_ascii
39
-
40
- ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
- @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
42
-
43
-
44
- html_sects = if @html.empty?
45
- ## note: support "empty" pages - old format waiting for update!!!
46
- ## cannot parse for now
47
- [] ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
48
- else
49
- @html_debug = map_sects( @html )
50
- @html_debug = map_subsects( @html_debug )
51
-
52
- split_sects( @html_debug )
53
- end
54
-
55
- pp html_sects
56
-
57
- ## debug
58
- ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
59
-
60
-
61
- @sects = []
62
- html_sects.each do |html_sect|
63
- html_sect_head = html_sect[0]
64
- html_subsects = html_sect[1]
65
- puts html_sect_head
66
- puts html_subsects.size
67
-
68
- ## get section title
69
- ## @SECTION{Economy} => Economy
70
- if html_sect_head =~ /@SECTION{(.+?)}/
71
- title = $1.strip
72
- puts title
73
- sect = Sect.new
74
- sect.title = title
75
- ## get subsections
76
- subsects = []
77
- html_subsects.each do |html_subsect|
78
- html_subsect_head = html_subsect[0]
79
- html_subsect_body = html_subsect[1]
80
- if html_subsect_head =~ /@SUBSECTION{(.+?)}/
81
- title = $1.strip
82
- title = title.sub( /:\z/, '' ) # remove trailing : if present
83
- title = title.strip
84
-
85
- puts title
86
- subsect = Subsect.new
87
- subsect.title = title ## todo/fix: cut off trailing colon (:)
88
-
89
- b = Factbook::ItemBuilder.new( html_subsect_body, title )
90
- h = b.read
91
- subsect.data = h
92
-
93
- subsects << subsect
94
- else
95
- ## warn/fix: no subsection title found
96
- end
97
- end
98
- sect.subsects = subsects
99
- @sects << sect
100
- else
101
- ## warn/fix: no section title found
102
- end
103
- end
104
-
105
- self ## return self -- needed?? default (standard) anyway?? check and remove
106
- end
107
-
108
-
109
-
110
- def map_sects( html )
111
- ## convert section titles to "unified" marker
112
- ## e.g.
113
- ## <h2>Introduction</h2>
114
-
115
- title_regex= /<h2>
116
- \s*
117
- (.+?) ## note: use non-greedy; do NOT allow tags inside for now
118
- \s*
119
- <\/h2>
120
- /xim
121
-
122
- html = html.gsub( title_regex ) do |m|
123
- puts "** found section >#{$1}<:"
124
- puts " >|#{m}|<"
125
-
126
- "\n\n@SECTION{#{$1}}\n\n"
127
- end
128
- html
129
- end
130
-
131
-
132
- def map_subsects( html )
133
- ## convert subsection titles to "unified" marker
134
- ## e.g.
135
- ## <h3>Disputes - international:</h3>
136
-
137
- title_regex= /<h3>
138
- \s*
139
- (.+?) ## note: use non-greedy; allows tags inside - why? why not
140
- \s*
141
- <\/h3>
142
- /xim
143
-
144
- html = html.gsub( title_regex ) do |m|
145
- puts "** found subsection >#{$1}<:"
146
- puts " >|#{m}|<"
147
-
148
- "\n@SUBSECTION{#{$1}}\n"
149
- end
150
- html
151
- end
152
-
153
-
154
-
155
- def split_sects( html )
156
- ####
157
- # split html in sections (divided by section headings)
158
- # e.g. remove optional prolog ??,
159
- ## [[heading,sect],
160
- ## [heading,sect],
161
- ## [heading,sect],...]
162
-
163
- ## note: "wrap" regex in a capture group (just one)
164
- ## String#split will include all capture groups in the result array
165
-
166
- section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
167
-
168
- chunks = html.split( section_regex )
169
-
170
- ## check if first item is a section or (html) prolog
171
- # if prolog (remove)
172
- chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
173
-
174
- pairs = chunks.each_slice(2).to_a
175
-
176
- ## now split subsections
177
- newpairs = []
178
- pairs.each do |item|
179
- ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
180
- newpairs << [item[0], split_subsects( item[1]) ]
181
- end
182
- newpairs
183
- end
184
-
185
-
186
- def split_subsects( html )
187
- ####
188
- # split html in subsections (divided by subsection headings)
189
- # e.g. remove optional prolog ??,
190
- ## [[heading,sect],
191
- ## [heading,sect],
192
- ## [heading,sect],...]
193
-
194
- ## note: "wrap" regex in a capture group (just one)
195
- ## String#split will include all capture groups in the result array
196
-
197
- subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
198
-
199
- chunks = html.split( subsection_regex )
200
-
201
- ## check if first item is a section or (html) prolog
202
- # if prolog (remove)
203
- chunks.slice!(0) unless chunks[0] =~ /@SUBSECTION/ ## starts w/ @SUBSECTION
204
-
205
- pairs = chunks.each_slice(2).to_a
206
- pairs
207
- end
208
-
209
- end # class Builder
210
-
211
-
212
- end # module Factbook
@@ -1,126 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class ItemBuilder ## rename to ItemReader, ItemParser - why? why not??
6
- include LogUtils::Logging
7
- include NormalizeHelper ## e.g. normalize_category
8
-
9
- def initialize( html, name )
10
- @html = html
11
- @name = name # add category/field name e.g. Area, Location, etc.
12
- end
13
-
14
-
15
-
16
- ##
17
- ## <div class="category_data subfield text">
18
- ## Portuguese (official and most widely spoken language)
19
- ##
20
- ## </div>
21
- ## <div class="category_data note">
22
- ## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
23
- ## </div>
24
-
25
-
26
- def read
27
- ## return hash from html snippet
28
- doc = Nokogiri::HTML.fragment( @html )
29
-
30
- data = {}
31
-
32
- ## note:
33
- ## skip whitespace text nodes (e.g. \n\n etc); just use divs
34
- doc_children = doc.children.filter('div')
35
-
36
- puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
37
-
38
- doc_children.each_with_index do |div,i|
39
- if div['class'].index( 'note' )
40
- text = squish( div.text.strip )
41
- puts "category_data: >#{text}<"
42
-
43
- data['note'] = { 'text' => text }
44
- elsif div['class'].index( 'historic' )
45
- ## add all historic together into one for now
46
- text = squish( div.text.strip )
47
- puts "category_data: >#{text}<"
48
-
49
- if i == 0
50
- data['text'] = text
51
- else
52
- ## append with / for now
53
- data['text'] += " / #{text}"
54
- end
55
- elsif div.css( 'span.subfield-name').empty?
56
- ## assume "implied text field"
57
- ## check for index == 1 / child count == 1 - why? why not
58
- text = squish( div.text.strip ) ## fix/todo: use strip
59
- puts "category_data: >#{text}<"
60
-
61
- data['text'] = text
62
-
63
- ## must be always first node for now
64
- if i != 0
65
- puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
66
- puts @html
67
- exit 1
68
- end
69
- elsif div['class'].index( 'grouped_subfield' )
70
- ## split grouped subfield!!
71
- ## <span class="subfield-name">arable land:</span>
72
- ## <span class="subfield-number">8.6%</span>
73
- ## <span class="subfield-date">(2011 est.)</span>
74
- ## /
75
- ## <span class="subfield-name">permanent crops:</span>
76
- ## <span class="subfield-number">0.8%</span>
77
- ## <span class="subfield-date">(2011 est.)</span>
78
- ## /
79
- ## <span class="subfield-name">permanent pasture:</span>
80
- ## <span class="subfield-number">23.5%</span>
81
- ## <span class="subfield-date">(2011 est.)</span>
82
-
83
- ## join names for now - why? why not?
84
- ## e.g. becomes:
85
- ## arable land / permanent crops / permanent pasture: for key ??
86
- span_names = div.css( 'span.subfield-name')
87
- keys = []
88
- span_names.each do |span|
89
- keys << normalize_category( span.text.strip )
90
- span.replace( '' )
91
- end
92
- key = keys.join( ' / ')
93
- text = squish( div.text.strip )
94
- puts "category_data key >#{key}<: >#{text}<"
95
- data[ key ] = { 'text' => text }
96
- else
97
- ## get subfield name
98
- span_names = div.css( 'span.subfield-name')
99
- if span_names.size > 1
100
- puts "!! ERROR - found more than one subfield-name:"
101
- puts div.to_html
102
- exit 1
103
- end
104
- key = normalize_category( span_names[0].text.strip )
105
- span_names[0].replace( '' )
106
-
107
- text = squish( div.text.strip )
108
- puts "category_data key >#{key}<: >#{text}<"
109
- data[ key ] = { 'text' => text }
110
- end
111
- end
112
-
113
-
114
- pp data
115
- data
116
- end
117
-
118
-
119
-
120
- def squish( str )
121
- str.gsub( /[ \t\n\r]{2,}/, ' ') ## replace multi-spaces (incl. newlines) with one space
122
- end
123
-
124
- end # class ItemBuilder
125
-
126
- end # module Factbook