factbook 0.1.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/Manifest.txt +34 -22
  3. data/README.md +8 -3
  4. data/Rakefile +2 -263
  5. data/data/codes.csv +262 -0
  6. data/data/comparisons.csv +75 -0
  7. data/lib/factbook/builder.rb +214 -0
  8. data/lib/factbook/builder_item.rb +93 -0
  9. data/lib/factbook/codes.rb +119 -0
  10. data/lib/factbook/comparisons.rb +50 -0
  11. data/lib/factbook/page.rb +103 -303
  12. data/lib/factbook/sanitizer.rb +214 -0
  13. data/lib/factbook/sect.rb +29 -196
  14. data/lib/factbook/subsect.rb +18 -0
  15. data/lib/factbook/table.rb +52 -0
  16. data/lib/factbook/utils.rb +85 -0
  17. data/lib/factbook/utils_info.rb +102 -0
  18. data/lib/factbook/version.rb +4 -3
  19. data/lib/factbook.rb +23 -1
  20. data/test/data/au.html +579 -0
  21. data/test/data/au.yml +8 -0
  22. data/test/data/be.html +596 -0
  23. data/test/data/be.yml +8 -0
  24. data/test/data/src/au.html +2006 -0
  25. data/test/data/src/be.html +2011 -0
  26. data/test/helper.rb +0 -4
  27. data/test/test_builder.rb +37 -0
  28. data/test/test_codes.rb +76 -0
  29. data/test/test_comparisons.rb +19 -0
  30. data/test/test_fields.rb +21 -18
  31. data/test/test_item_builder.rb +99 -0
  32. data/test/test_json.rb +17 -20
  33. data/test/test_page.rb +18 -10
  34. data/test/test_sanitizer.rb +35 -0
  35. metadata +68 -49
  36. data/.gemtest +0 -0
  37. data/test/data/countrytemplate_au.html +0 -4179
  38. data/test/data/countrytemplate_be.html +0 -4260
  39. data/test/data/countrytemplate_br.html +0 -4366
  40. data/test/data/countrytemplate_ee.html +0 -2999
  41. data/test/data/countrytemplate_ls.html +0 -2728
  42. data/test/data/countrytemplate_mx.html +0 -4397
  43. data/test/data/countrytemplate_vt.html +0 -1726
  44. data/test/data/countrytemplate_xx.html +0 -2898
  45. data/test/test_page_old.rb +0 -478
  46. data/test/test_strip.rb +0 -66
@@ -0,0 +1,75 @@
1
+ Num,Category,Name
2
+ 2147,Geography,Area
3
+ 2119,People and Society,Population
4
+ 2002,People and Society,Population growth rate
5
+ 2054,People and Society,Birth rate
6
+ 2066,People and Society,Death rate
7
+ 2112,People and Society,Net migration rate
8
+ 2223,People and Society,Maternal mortality rate
9
+ 2091,People and Society,Infant mortality rate
10
+ 2102,People and Society,Life expectancy at birth
11
+ 2127,People and Society,Total fertility rate
12
+ 2225,People and Society,Health expenditures
13
+ 2155,People and Society,HIV/AIDS - adult prevalence rate
14
+ 2156,People and Society,HIV/AIDS - people living with HIV/AIDS
15
+ 2157,People and Society,HIV/AIDS - deaths
16
+ 2228,People and Society,Obesity - adult prevalence rate
17
+ 2224,People and Society,Children under the age of 5 years underweight
18
+ 2206,People and Society,Education expenditures
19
+ 2229,People and Society,"Unemployment, youth ages 15-24"
20
+ 2001,Economy,GDP (purchasing power parity)
21
+ 2003,Economy,GDP - real growth rate
22
+ 2004,Economy,GDP - per capita (PPP)
23
+ 2260,Economy,Gross national saving
24
+ 2089,Economy,Industrial production growth rate
25
+ 2095,Economy,Labor force
26
+ 2129,Economy,Unemployment rate
27
+ 2172,Economy,Distribution of family income - Gini index
28
+ 2221,Economy,Taxes and other revenues
29
+ 2222,Economy,Budget surplus (+) or deficit (-)
30
+ 2186,Economy,Public debt
31
+ 2092,Economy,Inflation rate (consumer prices)
32
+ 2207,Economy,Central bank discount rate
33
+ 2208,Economy,Commercial bank prime lending rate
34
+ 2214,Economy,Stock of narrow money
35
+ 2215,Economy,Stock of broad money
36
+ 2211,Economy,Stock of domestic credit
37
+ 2200,Economy,Market value of publicly traded shares
38
+ 2187,Economy,Current account balance
39
+ 2078,Economy,Exports
40
+ 2087,Economy,Imports
41
+ 2188,Economy,Reserves of foreign exchange and gold
42
+ 2079,Economy,Debt - external
43
+ 2198,Economy,Stock of direct foreign investment - at home
44
+ 2199,Economy,Stock of direct foreign investment - abroad
45
+ 2232,Energy,Electricity - production
46
+ 2233,Energy,Electricity - consumption
47
+ 2234,Energy,Electricity - exports
48
+ 2235,Energy,Electricity - imports
49
+ 2236,Energy,Electricity - installed generating capacity
50
+ 2237,Energy,Electricity - from fossil fuels
51
+ 2239,Energy,Electricity - from nuclear fuels
52
+ 2238,Energy,Electricity - from hydroelectric plants
53
+ 2240,Energy,Electricity - from other renewable sources
54
+ 2241,Energy,Crude oil - production
55
+ 2242,Energy,Crude oil - exports
56
+ 2243,Energy,Crude oil - imports
57
+ 2244,Energy,Crude oil - proved reserves
58
+ 2245,Energy,Refined petroleum products - production
59
+ 2246,Energy,Refined petroleum products - consumption
60
+ 2247,Energy,Refined petroleum products - exports
61
+ 2248,Energy,Refined petroleum products - imports
62
+ 2249,Energy,Natural gas - production
63
+ 2250,Energy,Natural gas - consumption
64
+ 2251,Energy,Natural gas - exports
65
+ 2252,Energy,Natural gas - imports
66
+ 2253,Energy,Natural gas - proved reserves
67
+ 2150,Communications,Telephones - fixed lines
68
+ 2151,Communications,Telephones - mobile cellular
69
+ 2153,Communications,Internet users
70
+ 2053,Transportation,Airports
71
+ 2121,Transportation,Railways
72
+ 2085,Transportation,Roadways
73
+ 2093,Transportation,Waterways
74
+ 2108,Transportation,Merchant marine
75
+ 2034,Military,Military expenditures
@@ -0,0 +1,214 @@
# encoding: utf-8

module Factbook

##
# Builds a Page (sections -> subsections -> data hashes) from the html
# of a single profile page.
#
# Steps (see #initialize):
#   1. pass the raw html through Sanitizer#sanitize
#   2. replace section (<h2>) and subsection (<div id='field' class='category'>)
#      headings with plain-text @SECTION{..} / @SUBSECTION{..} markers
#   3. split the marked-up text on the markers
#   4. hand every subsection body to ItemBuilder and collect the results
#      in Sect / Subsect objects
#
# Trace output goes through the logger provided by LogUtils::Logging
# (and no longer unconditionally to stdout).
class Builder    ## todo: change to PageBuilder ???
  include LogUtils::Logging

  ## note: markers get spliced into the html as plain text by
  ##   map_sects / map_subsects; the title is the first capture group
  SECTION_MARKER_REGEX    = /@SECTION\{(.+?)\}/
  SUBSECTION_MARKER_REGEX = /@SUBSECTION\{(.+?)\}/

  ## (disabled for now)
  ## def self.from_cc( cc, opts={} )    ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
  ##   ## check/todo: rename input_dir to just dir or to include ?
  ##   ##    (there's no output_dir)?? - why? why not?
  ##   input_dir = opts[:input_dir] || '.'
  ##   self.from_file( "#{input_dir}/#{cc}.html" )
  ## end

  ##
  # Reads the page from +path+ and builds it.
  #
  # NOTE(review): the page is read in text mode with the default external
  #   encoding; the original todo asks for a binary (ASCII-8BIT) read -
  #   confirm that Sanitizer copes with a binary string before switching.
  def self.from_file( path )
    html_ascii = File.read( path )    ## fix/todo: use ASCII8BIT/binary reader !!!!!
    new( html_ascii )
  end


  attr_reader :html_ascii,    ## full "original" 1:1 page in "original/ascii8/binary" encoding
              :html,          ## utf-8 encoded profile
              :html_debug,    ## html w/ mapping markers - rename to html_markers - why? why not?
              :page_info,     ## incl. country_name, region_name, last_updated etc.
              :errors,        ## encoding errors etc.
              :page

  ##
  # html_ascii - the complete page as read from disk / the web
  def initialize( html_ascii )
    @html_ascii = html_ascii

    ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
    @html, @page_info, @errors = Sanitizer.new.sanitize( @html_ascii )

    @html_debug = map_sects( @html )
    @html_debug = map_subsects( @html_debug )

    html_sects = split_sects( @html_debug )

    page = Page.new
    ## note: build_sect returns nil for chunks without a section title; skip those
    page.sects = html_sects.map { |head, html_subsects| build_sect( head, html_subsects ) }.compact
    @page = page
  end


  ##
  # Converts section titles from <h2>..</h2> to the "unified" marker.
  #
  # e.g.
  #   <h2 sectiontitle='Introduction' ccode='au'>Introduction :: <span class='region'>AUSTRIA </span></h2>
  #   <h2>Introduction</h2>
  # both become
  #   @SECTION{Introduction}
  #
  # Returns a new string; html itself is not changed.
  def map_sects( html )
    title_regex = /<h2
                     (?:\s[^>]+)?  ## allow optional attributes in h2
                     >
                     \s*
                     ([^<>]+?)     ## note: use non-greedy; do NOT allow tags inside for now
                     \s*
                     (?:\s::\s
                        .+?        ## note: use non-greedy; allows tags inside
                     )?            ## strip optional name (e.g. :: AUSTRIA)
                     <\/h2>
                  /xim

    html.gsub( title_regex ) do |m|
      title = $1    ## grab capture before calling anything else
      logger.debug "** found section >#{title}<:"
      logger.debug " >|#{m}|<"

      "\n\n@SECTION{#{title}}\n\n"
    end
  end


  ##
  # Converts subsection titles from <div id='field' class='category'>..</div>
  # to the "unified" marker.
  #
  # e.g.
  #   <div id='field' class='category'>Disputes - international:</div>
  # becomes
  #   @SUBSECTION{Disputes - international:}
  #
  # Returns a new string; html itself is not changed.
  def map_subsects( html )
    title_regex = /<div \s id='field'
                        \s class='category'>
                     \s*
                     (.+?)       ## note: use non-greedy; allows tags inside - why? why not
                     \s*
                     <\/div>
                  /xim

    html.gsub( title_regex ) do |m|
      title = $1    ## grab capture before calling anything else
      logger.debug "** found subsection >#{title}<:"
      logger.debug " >|#{m}|<"

      "\n@SUBSECTION{#{title}}\n"
    end
  end


  ##
  # Splits html in sections (divided by @SECTION{} markers).
  # Everything before the first marker (the prolog) gets dropped.
  #
  # Returns
  #   [[heading, subsects],
  #    [heading, subsects],...]
  # where heading is the marker text (e.g. "@SECTION{Economy}") and
  # subsects is the result of split_subsects for the text that follows.
  def split_sects( html )
    split_on_marker( html, SECTION_MARKER_REGEX ).map do |head, body|
      ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
      [head, split_subsects( body )]
    end
  end


  ##
  # Splits html in subsections (divided by @SUBSECTION{} markers).
  # Everything before the first marker (the prolog) gets dropped.
  #
  # Returns
  #   [[heading, body],
  #    [heading, body],...]
  def split_subsects( html )
    split_on_marker( html, SUBSECTION_MARKER_REGEX )
  end


private

  ##
  # Splits html on marker_regex and returns [marker, body] pairs.
  def split_on_marker( html, marker_regex )
    ## note: "wrap" regex in a capture group (just one);
    ##   String#split will include all capture groups in the result array
    chunks = html.split( /(#{marker_regex.source})/ )

    ## note: the text before the first marker always comes first (it is an
    ##   empty string if html starts with a marker) - always drop the prolog;
    ##   checking the chunk for the marker text would keep a prolog that
    ##   only mentions it and shift all heading/body pairs by one
    chunks.shift

    ## note: split drops trailing empty strings; a marker at the very end
    ##   has no body chunk - use an empty body (instead of nil)
    chunks.each_slice(2).map { |head, body| [head, body || ''] }
  end

  ##
  # Builds a Sect from a section marker and its [heading, body] subsection
  # pairs; returns nil if the marker holds no title.
  def build_sect( html_sect_head, html_subsects )
    unless html_sect_head =~ SECTION_MARKER_REGEX
      ## warn/fix: no section title found
      logger.warn "no section title found in >#{html_sect_head}<; skipping"
      return nil
    end

    sect_title = $1.strip
    logger.debug "section >#{sect_title}< - #{html_subsects.size} subsection(s)"

    sect = Sect.new
    sect.title = sect_title
    sect.subsects = html_subsects.map { |head, body| build_subsect( head, body ) }.compact
    sect
  end

  ##
  # Builds a Subsect from a subsection marker and the html that follows it;
  # returns nil if the marker holds no title.
  def build_subsect( html_subsect_head, html_subsect_body )
    unless html_subsect_head =~ SUBSECTION_MARKER_REGEX
      ## warn/fix: no subsection title found
      logger.warn "no subsection title found in >#{html_subsect_head}<; skipping"
      return nil
    end

    subsect_title = $1.strip
    subsect_title = subsect_title.sub( /:\z/, '' )    # remove trailing : if present
    subsect_title = subsect_title.strip
    logger.debug "subsection >#{subsect_title}<"

    subsect = Subsect.new
    subsect.title = subsect_title
    subsect.data  = Factbook::ItemBuilder.new( html_subsect_body, subsect_title ).read
    subsect
  end

end # class Builder


end # module Factbook
@@ -0,0 +1,93 @@
# encoding: utf-8

module Factbook

##
# Reads the html of a single subsection (field) and turns it into a hash.
#
# Only top-level <div>s are looked at:
#   - <div class='category_data'>  - text gets added to the 'text' entry of
#       the last key seen (or to the top-level 'text' entry if there is no
#       key yet); additional texts get appended with ' ++ '
#       (or with a single space for the demographic profile)
#   - <div> (without class)        - expected to hold a key <span> and an
#       optional value <span>; starts a new { 'text' => value } entry
#       stored under the key (with the trailing colon cut off)
#   - any other class              - logged and skipped
class ItemBuilder     ## renameto ItemReader, ItemParser - why? why not??
  include LogUtils::Logging

  ##
  # html - html snippet for the subsection body
  # name - category/field name e.g. Area, Location, etc.
  def initialize( html, name )
    @html = html
    @name = name
  end

  ##
  # Returns the hash built from the html snippet.
  def read
    doc = Nokogiri::HTML.fragment( @html )

    data = {}
    last_node            = nil     ## track last hash (always use text key)
    last_node_data_count = 0

    ## note:
    ##   skip whitespace text nodes (e.g. \n\n etc); just use divs
    doc.children.filter('div').each do |child|
      css = child['class']

      if css == 'category_data'
        text = child.text   ## fix/todo: use strip
        logger.debug "category_data: >#{text}<"

        if last_node.nil?
          ## assume its the very first entry; use implied/auto-created category
          data['text'] = ''
          last_node = data
          last_node_data_count = 0
        end

        ## note: text is nil if the key came without a value span;
        ##   treat nil like empty (nil += "..." would raise)
        if last_node_data_count == 0 && last_node['text'].to_s.empty?
          last_node['text'] = text      ### first category_data element
        elsif last_node_data_count == 0 || demographic_profile?
          ## todo: issue warning for count zero here - why? why not??
          last_node['text'] += " #{text}"       ## append without (w/o) separator
        else
          last_node['text'] += " ++ #{text}"    ## append with ++ separator
        end
        last_node_data_count += 1

      elsif css.nil?     ## div without any class e.g. <div>..</div>
        ## assume category and category_data pair w/ spans
        spans = child.children.filter('span')
        if spans.size > 2
          logger.warn "*** warn: expected two (or one) spans; got #{spans.inspect}"
        end

        span_key   = spans[0]   ## assume 1st entry is span.category
        span_value = spans[1]   ## assume 2nd entry is span.category_data
                                ##   allow optional category_data for now
        if span_key.nil?
          ## note: no span at all - nothing to use as key; skip (do not crash)
          logger.warn "*** warn: item builder -- no span (key) found in #{child.inspect}"
          next
        end

        key = span_key.text.strip
        key = key.sub( /:\z/, '' )    # remove trailing : if present
        key = key.strip

        value = span_value ? span_value.text : nil

        logger.debug "key: >#{key}<, value: >#{value}< : #{value.class.name}"

        ## start new pair
        last_node = data[key] = { 'text' => value }
        last_node_data_count = value ? 1 : 0   ## note: set to 1 if value present
      else
        logger.warn "*** warn: item builder -- unknow css class in #{child.inspect}"
      end
    end

    data
  end

private

  ## special case (use space as separator when appending texts)
  def demographic_profile?
    @name == 'demographic_profile' || @name == 'Demographic profile'
  end

end # class ItemBuilder

end # module Factbook
@@ -0,0 +1,119 @@
# encoding: utf-8

##
# note:
#   the factbook category/region for world is other entities (on FAQ) and oceans in page
#     changed to world


module Factbook

##
# Collection of Code records (code, name, category, region) with
# chainable filters, e.g.
#
#   codes = Codes.from_csv( 'data/codes.csv' )
#   codes.countries.europe.to_a     # => array of code strings
#
# note: includes Enumerable (map, select, etc. yield Code records),
#   BUT to_a returns the code strings only
class Codes
  include Enumerable

  Code = Struct.new( :code,     ## todo: add notes (country affiliation) - why? why not??
                     :name,
                     :category, ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
                     :region,   ## e.g. Europe, Oceans, etc.
                   )

  ##
  # Reads the codes from a csv file with a header line
  # (columns Code and Name are required; Category and Region are optional).
  #
  # note:
  #   if you use quotes - NO leading spaces allowed e.g.
  #     use au,"Austria",... and NOT
  #         au, "Austria", ...
  #
  #   for headers - NO leading spaces allowed e.g.
  #     use Code,Name,Category,Region,... and NOT
  #         Code, Name, Category, Region, ...
  #
  # Raises NoMethodError if a row has no value for Code or Name.
  def self.from_csv( path )
    rows = CSV.read( path, headers: true )

    recs = rows.map do |row|
      rec = Code.new
      rec.code = row['Code'].strip    ## remove leading n trailing whitespaces
      rec.name = row['Name'].strip

      ## note: for now category and region are optional
      rec.category = row['Category'].strip  if row['Category']
      rec.region   = row['Region'].strip    if row['Region']
      rec
    end

    new( recs )
  end

  ## codes - array of Code records
  def initialize( codes )
    @codes = codes
  end

  def size() @codes.size; end

  ##
  # Yields every Code record; returns an enumerator if no block is given.
  def each
    return enum_for( :each )  unless block_given?
    @codes.each {|code| yield( code ) }
  end

  def to_a
    @codes.map {|code| code.code }   ## return array of codes
  end

  ##  def all()  self.to_a; end    ## note: alias for to_a - use - why? why not??

  ## "pre-defined" convenience shortcuts
  def countries()       category 'Countries';     end
  def world()           category 'World';         end
  def oceans()          category 'Oceans';        end
  def misc()            category 'Miscellaneous'; end
  def others()          category 'Other';         end
  def dependencies()    category 'Dependencies';  end
  def dependencies_us() category 'Dependencies (United States)'; end
  ## fix/todo: add all dependencies  uk (or gb?), fr,cn,au,nz,no,dk,etc.

  def europe()                       region 'Europe';                        end
  def south_asia()                   region 'South Asia';                    end
  def central_asia()                 region 'Central Asia';                  end
  def east_n_southeast_asia()        region 'East & Southeast Asia';         end
  def middle_east()                  region 'Middle East';                   end
  def africa()                       region 'Africa';                        end
  def north_america()                region 'North America';                 end
  def central_america_n_caribbean()  region 'Central America and Caribbean'; end
  def south_america()                region 'South America';                 end
  def australia_oceania()            region 'Australia-Oceania';             end
  def antarctica()                   region 'Antarctica';                    end

  ## note: keep the (misspelled) names of the first release working
  alias_method :east_n_souteast_asia, :east_n_southeast_asia
  alias_method :antartica,            :antarctica

  ## note: regions oceans and world - same as category oceans and world
  ##   use oceans_ii or world_ii or something ??
  ##   use category('World') n region('World')
  ##   use category('Oceans') n region('Oceans')


  ##
  # Returns a new Codes obj (for easy-chaining) with all records whose
  # category contains query (case-insensitive); records without a category
  # never match.
  #
  # note: it's a "contains" search e.g. 'Dependencies' will match
  #   Dependencies (United States) too
  def category( query )
    filter_by( :category, query )
  end

  ##
  # Returns a new Codes obj (for easy-chaining) with all records whose
  # region contains query (case-insensitive); records without a region
  # never match.
  def region( query )
    filter_by( :region, query )
  end

private

  ## shared by category and region; attr is the Code member to search in
  def filter_by( attr, query )
    ## todo/future: allow passing in of regex too (not just string)
    ##  note: e.g. Dependencies (France) needs to get escaped to
    ##              Dependencies \(France\) etc.
    filter_regex = /#{Regexp.escape(query)}/i
    matches = @codes.select do |code|
      value = code[attr]
      value ? filter_regex.match( value ) : false   ## note: allow nil; will fail on search
    end
    Codes.new( matches )
  end

end  # class Codes

end # module Factbook

@@ -0,0 +1,50 @@
# encoding: utf-8

module Factbook

##
# Collection of Comparison records (num, category, name) read from a
# csv file e.g. data/comparisons.csv.
#
# note: includes Enumerable (map, select, etc. yield Comparison records),
#   BUT to_a returns the nums only
class Comparisons
  include Enumerable

  Comparison = Struct.new( :num,      ### todo: use no or id or something - why? why not?
                           :category, ## e.g. Geography, People, Economy, etc.
                           :name,
                         )

  ##
  # Reads the comparisons from a csv file with a header line and the
  # (required) columns Num, Category and Name.
  #
  # note: Num gets converted with to_i (non-numeric values turn into 0)
  #
  # Raises NoMethodError if a row has no value for one of the columns.
  def self.from_csv( path )
    rows = CSV.read( path, headers: true )

    recs = rows.map do |row|
      rec = Comparison.new
      rec.num      = row['Num'].strip.to_i    ## remove leading n trailing whitespaces
      rec.category = row['Category'].strip
      rec.name     = row['Name'].strip
      rec
    end

    new( recs )
  end

  ## comps - array of Comparison records
  def initialize( comps )
    @comps = comps
  end

  def size() @comps.size; end

  ##
  # Yields every Comparison record; returns an enumerator if no block is given.
  def each
    return enum_for( :each )  unless block_given?
    @comps.each {|comp| yield( comp ) }
  end

  def to_a
    @comps.map {|comp| comp.num }   ## return array of nums -- return something else - why? why not?
  end

end  # class Comparisons

end # module Factbook
