factbook 0.1.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/Manifest.txt +34 -22
  3. data/README.md +8 -3
  4. data/Rakefile +2 -263
  5. data/data/codes.csv +262 -0
  6. data/data/comparisons.csv +75 -0
  7. data/lib/factbook/builder.rb +214 -0
  8. data/lib/factbook/builder_item.rb +93 -0
  9. data/lib/factbook/codes.rb +119 -0
  10. data/lib/factbook/comparisons.rb +50 -0
  11. data/lib/factbook/page.rb +103 -303
  12. data/lib/factbook/sanitizer.rb +214 -0
  13. data/lib/factbook/sect.rb +29 -196
  14. data/lib/factbook/subsect.rb +18 -0
  15. data/lib/factbook/table.rb +52 -0
  16. data/lib/factbook/utils.rb +85 -0
  17. data/lib/factbook/utils_info.rb +102 -0
  18. data/lib/factbook/version.rb +4 -3
  19. data/lib/factbook.rb +23 -1
  20. data/test/data/au.html +579 -0
  21. data/test/data/au.yml +8 -0
  22. data/test/data/be.html +596 -0
  23. data/test/data/be.yml +8 -0
  24. data/test/data/src/au.html +2006 -0
  25. data/test/data/src/be.html +2011 -0
  26. data/test/helper.rb +0 -4
  27. data/test/test_builder.rb +37 -0
  28. data/test/test_codes.rb +76 -0
  29. data/test/test_comparisons.rb +19 -0
  30. data/test/test_fields.rb +21 -18
  31. data/test/test_item_builder.rb +99 -0
  32. data/test/test_json.rb +17 -20
  33. data/test/test_page.rb +18 -10
  34. data/test/test_sanitizer.rb +35 -0
  35. metadata +68 -49
  36. data/.gemtest +0 -0
  37. data/test/data/countrytemplate_au.html +0 -4179
  38. data/test/data/countrytemplate_be.html +0 -4260
  39. data/test/data/countrytemplate_br.html +0 -4366
  40. data/test/data/countrytemplate_ee.html +0 -2999
  41. data/test/data/countrytemplate_ls.html +0 -2728
  42. data/test/data/countrytemplate_mx.html +0 -4397
  43. data/test/data/countrytemplate_vt.html +0 -1726
  44. data/test/data/countrytemplate_xx.html +0 -2898
  45. data/test/test_page_old.rb +0 -478
  46. data/test/test_strip.rb +0 -66
@@ -0,0 +1,75 @@
1
+ Num,Category,Name
2
+ 2147,Geography,Area
3
+ 2119,People and Society,Population
4
+ 2002,People and Society,Population growth rate
5
+ 2054,People and Society,Birth rate
6
+ 2066,People and Society,Death rate
7
+ 2112,People and Society,Net migration rate
8
+ 2223,People and Society,Maternal mortality rate
9
+ 2091,People and Society,Infant mortality rate
10
+ 2102,People and Society,Life expectancy at birth
11
+ 2127,People and Society,Total fertility rate
12
+ 2225,People and Society,Health expenditures
13
+ 2155,People and Society,HIV/AIDS - adult prevalence rate
14
+ 2156,People and Society,HIV/AIDS - people living with HIV/AIDS
15
+ 2157,People and Society,HIV/AIDS - deaths
16
+ 2228,People and Society,Obesity - adult prevalence rate
17
+ 2224,People and Society,Children under the age of 5 years underweight
18
+ 2206,People and Society,Education expenditures
19
+ 2229,People and Society,"Unemployment, youth ages 15-24"
20
+ 2001,Economy,GDP (purchasing power parity)
21
+ 2003,Economy,GDP - real growth rate
22
+ 2004,Economy,GDP - per capita (PPP)
23
+ 2260,Economy,Gross national saving
24
+ 2089,Economy,Industrial production growth rate
25
+ 2095,Economy,Labor force
26
+ 2129,Economy,Unemployment rate
27
+ 2172,Economy,Distribution of family income - Gini index
28
+ 2221,Economy,Taxes and other revenues
29
+ 2222,Economy,Budget surplus (+) or deficit (-)
30
+ 2186,Economy,Public debt
31
+ 2092,Economy,Inflation rate (consumer prices)
32
+ 2207,Economy,Central bank discount rate
33
+ 2208,Economy,Commercial bank prime lending rate
34
+ 2214,Economy,Stock of narrow money
35
+ 2215,Economy,Stock of broad money
36
+ 2211,Economy,Stock of domestic credit
37
+ 2200,Economy,Market value of publicly traded shares
38
+ 2187,Economy,Current account balance
39
+ 2078,Economy,Exports
40
+ 2087,Economy,Imports
41
+ 2188,Economy,Reserves of foreign exchange and gold
42
+ 2079,Economy,Debt - external
43
+ 2198,Economy,Stock of direct foreign investment - at home
44
+ 2199,Economy,Stock of direct foreign investment - abroad
45
+ 2232,Energy,Electricity - production
46
+ 2233,Energy,Electricity - consumption
47
+ 2234,Energy,Electricity - exports
48
+ 2235,Energy,Electricity - imports
49
+ 2236,Energy,Electricity - installed generating capacity
50
+ 2237,Energy,Electricity - from fossil fuels
51
+ 2239,Energy,Electricity - from nuclear fuels
52
+ 2238,Energy,Electricity - from hydroelectric plants
53
+ 2240,Energy,Electricity - from other renewable sources
54
+ 2241,Energy,Crude oil - production
55
+ 2242,Energy,Crude oil - exports
56
+ 2243,Energy,Crude oil - imports
57
+ 2244,Energy,Crude oil - proved reserves
58
+ 2245,Energy,Refined petroleum products - production
59
+ 2246,Energy,Refined petroleum products - consumption
60
+ 2247,Energy,Refined petroleum products - exports
61
+ 2248,Energy,Refined petroleum products - imports
62
+ 2249,Energy,Natural gas - production
63
+ 2250,Energy,Natural gas - consumption
64
+ 2251,Energy,Natural gas - exports
65
+ 2252,Energy,Natural gas - imports
66
+ 2253,Energy,Natural gas - proved reserves
67
+ 2150,Communications,Telephones - fixed lines
68
+ 2151,Communications,Telephones - mobile cellular
69
+ 2153,Communications,Internet users
70
+ 2053,Transportation,Airports
71
+ 2121,Transportation,Railways
72
+ 2085,Transportation,Roadways
73
+ 2093,Transportation,Waterways
74
+ 2108,Transportation,Merchant marine
75
+ 2034,Military,Military expenditures
@@ -0,0 +1,214 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ class Builder ## todo: change to PageBuilder ???
6
+ include LogUtils::Logging
7
+
8
+
9
+ =begin
10
+ def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
11
+ ## check/todo: rename input_dir to just dir or to include ?
12
+ ## (there's no output_dir)?? - why? why not?
13
+ input_dir = opts[:input_dir] || '.'
14
+ self.from_file( "#{input_dir}/#{cc}.html" )
15
+ end
16
+ =end
17
+
18
+
19
+ def self.from_file( path )
20
+ html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
21
+ self.new( html_ascii )
22
+ end
23
+
24
+
25
+ attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
26
+ :html, ## utf-8 encoded profile
27
+ :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
28
+ :page_info, ## incl. country_name, region_name, last_updated etc.
29
+ :errors, ## encoding errors etc.
30
+ :page
31
+
32
+ def initialize( html_ascii )
33
+ @html_ascii = html_ascii
34
+
35
+ ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
36
+ @html, @page_info, @errors = Sanitizer.new.sanitize( @html_ascii )
37
+
38
+ @html_debug = map_sects( @html )
39
+ @html_debug = map_subsects( @html_debug )
40
+
41
+ html_sects = split_sects( @html_debug )
42
+ pp html_sects
43
+
44
+
45
+ page = Page.new
46
+ sects = []
47
+ html_sects.each do |html_sect|
48
+ html_sect_head = html_sect[0]
49
+ html_subsects = html_sect[1]
50
+ puts html_sect_head
51
+ puts html_subsects.size
52
+
53
+ ## get section title
54
+ ## @SECTION{Economy} => Economy
55
+ if html_sect_head =~ /@SECTION{(.+?)}/
56
+ title = $1.strip
57
+ puts title
58
+ sect = Sect.new
59
+ sect.title = title
60
+ ## get subsections
61
+ subsects = []
62
+ html_subsects.each do |html_subsect|
63
+ html_subsect_head = html_subsect[0]
64
+ html_subsect_body = html_subsect[1]
65
+ if html_subsect_head =~ /@SUBSECTION{(.+?)}/
66
+ title = $1.strip
67
+ title = title.sub( /:\z/, '' ) # remove trailing : if present
68
+ title = title.strip
69
+
70
+ puts title
71
+ subsect = Subsect.new
72
+ subsect.title = title ## note: trailing colon (:) already cut off above
73
+
74
+ b = Factbook::ItemBuilder.new( html_subsect_body, title )
75
+ h = b.read
76
+ subsect.data = h
77
+
78
+ subsects << subsect
79
+ else
80
+ ## warn/fix: no subsection title found
81
+ end
82
+ end
83
+ sect.subsects = subsects
84
+ sects << sect
85
+ else
86
+ ## warn/fix: no section title found
87
+ end
88
+ end
89
+ page.sects = sects
90
+ @page = page
91
+
92
+ pp page
93
+
94
+ self ## return self -- needed?? default (standard) anyway?? check and remove
95
+ end
96
+
97
+
98
+
99
+ def map_sects( html )
100
+ ## convert section titles
101
+ ## from <h2>..</h2>
102
+ ## to "unified" marker
103
+
104
+ ## e.g.
105
+ ## <h2 sectiontitle='Introduction' ccode='au'>Introduction :: <span class='region'>AUSTRIA </span></h2>
106
+ ## <h2>Introduction</h2>
107
+
108
+ title_regex= /<h2
109
+ (?:\s[^>]+)? ## allow optional attributes in h2
110
+ >
111
+ \s*
112
+ ([^<>]+?) ## note: use non-greedy; do NOT allow tags inside for now
113
+ \s*
114
+ (?:\s::\s
115
+ .+? ## note: use non-greedy; allows tags inside
116
+ )? ## strip optional name (e.g. :: AUSTRIA)
117
+ <\/h2>
118
+ /xim
119
+
120
+ html = html.gsub( title_regex ) do |m|
121
+ puts "** found section >#{$1}<:"
122
+ puts " >|#{m}|<"
123
+
124
+ "\n\n@SECTION{#{$1}}\n\n"
125
+ end
126
+ html
127
+ end
128
+
129
+
130
+ def map_subsects( html )
131
+ ## convert subsection titles
132
+ ## from <div id='field'>..</div>
133
+ ## to "unified" marker
134
+
135
+ ## e.g.
136
+ ## <div id='field' class='category'>Disputes - international:</div>
137
+
138
+ title_regex= /<div \s id='field'
139
+ \s class='category'>
140
+ \s*
141
+ (.+?) ## note: use non-greedy; allows tags inside - why? why not
142
+ \s*
143
+ <\/div>
144
+ /xim
145
+
146
+ html = html.gsub( title_regex ) do |m|
147
+ puts "** found subsection >#{$1}<:"
148
+ puts " >|#{m}|<"
149
+
150
+ "\n@SUBSECTION{#{$1}}\n"
151
+ end
152
+ html
153
+ end
154
+
155
+
156
+
157
+ def split_sects( html )
158
+ ####
159
+ # split html in sections (divided by section headings)
160
+ # e.g. remove optional prolog ??,
161
+ ## [[heading,sect],
162
+ ## [heading,sect],
163
+ ## [heading,sect],...]
164
+
165
+ ## note: "wrap" regex in a capture group (just one)
166
+ ## String#split will include all capture groups in the result array
167
+
168
+ section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
169
+
170
+ chunks = html.split( section_regex )
171
+
172
+ ## check if first item is a section or (html) prolog
173
+ # if prolog (remove)
174
+ chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
175
+
176
+ pairs = chunks.each_slice(2).to_a
177
+
178
+ ## now split subsections
179
+ newpairs = []
180
+ pairs.each do |item|
181
+ ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
182
+ newpairs << [item[0], split_subsects( item[1]) ]
183
+ end
184
+ newpairs
185
+ end
186
+
187
+
188
+ def split_subsects( html )
189
+ ####
190
+ # split html in subsections (divided by subsection headings)
191
+ # e.g. remove optional prolog ??,
192
+ ## [[heading,sect],
193
+ ## [heading,sect],
194
+ ## [heading,sect],...]
195
+
196
+ ## note: "wrap" regex in a capture group (just one)
197
+ ## String#split will include all capture groups in the result array
198
+
199
+ subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
200
+
201
+ chunks = html.split( subsection_regex )
202
+
203
+ ## check if first item is a subsection or (html) prolog
204
+ # if prolog (remove)
205
+ chunks.slice!(0) unless chunks[0] =~ /@SUBSECTION/ ## starts w/ @SUBSECTION
206
+
207
+ pairs = chunks.each_slice(2).to_a
208
+ pairs
209
+ end
210
+
211
+ end # class Builder
212
+
213
+
214
+ end # module Factbook
@@ -0,0 +1,93 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ class ItemBuilder ## rename to ItemReader, ItemParser - why? why not??
6
+ include LogUtils::Logging
7
+
8
+ def initialize( html, name )
9
+ @html = html
10
+ @name = name # add category/field name e.g. Area, Location, etc.
11
+ end
12
+
13
+ def read
14
+ ## return hash from html snippet
15
+ doc = Nokogiri::HTML.fragment( @html )
16
+
17
+ data = {}
18
+ last_node = nil ## track last hash (always use text key)
19
+ last_node_data_count = 0
20
+
21
+ ## note:
22
+ ## skip whitespace text nodes (e.g. \n\n etc); just use divs
23
+ doc.children.filter('div').each_with_index do |child,i|
24
+
25
+ if child['class'] == 'category_data'
26
+ text = child.text ## fix/todo: use strip
27
+ puts "category_data: >#{text}<"
28
+
29
+ if last_node.nil?
30
+ ## assume it's the very first entry; use implied/auto-created category
31
+ data['text'] = ''
32
+ last_node = data
33
+ last_node_data_count = 0
34
+ end
35
+
36
+ ### first category_data element?
37
+ if last_node_data_count == 0
38
+ if last_node['text'] == ''
39
+ last_node['text'] = text
40
+ else ### possible ??? if data_count is zero - node should not include any data
41
+ ## todo: issue warning here - why? why not??
42
+ last_node['text'] += " #{text}" ## append w/o separator
43
+ end
44
+ else
45
+ if @name == 'demographic_profile' || @name == 'Demographic profile' ## special case (use space as sep)
46
+ last_node['text'] += " #{text}" ## append without (w/o) separator
47
+ else
48
+ last_node['text'] += " ++ #{text}" ## append with ++ separator
49
+ end
50
+ end
51
+ last_node_data_count += 1
52
+
53
+ elsif child['class'].nil? ## div without any class e.g. <div>..</div>
54
+ ## assume category and category_data pair w/ spans
55
+ spans = child.children.filter('span')
56
+ if spans.size > 2
57
+ puts "*** warn: expected two (or one) spans; got #{spans.inspect}"
58
+ end
59
+
60
+ ## pp spans
61
+
62
+ span_key = spans[0] ## assume 1st entry is span.category
63
+ span_value = spans[1] ## assume 2nd entry is span.category_data
64
+ ## allow optional category_data for now
65
+ key = span_key.text
66
+
67
+ key = key.strip
68
+ key = key.sub( /:\z/, '' ) # remove trailing : if present
69
+ key = key.strip
70
+
71
+ value = span_value ? span_value.text : nil
72
+
73
+ puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
74
+
75
+ ## start new pair
76
+ last_node = data[key] = { 'text' => value }
77
+ last_node_data_count = value ? 1 : 0 ## note: set to 1 if value present
78
+ else
79
+ puts "*** warn: item builder -- unknow css class in #{child.inspect}"
80
+ end
81
+
82
+ ## pp child
83
+ ## css = child['class']
84
+ ## puts "[#{i}] #{child.name} class='>#{css}< : #{css.class.name}' >#{child.text}<"
85
+ end
86
+
87
+ pp data
88
+ data
89
+ end
90
+
91
+ end # class ItemBuilder
92
+
93
+ end # module Factbook
@@ -0,0 +1,119 @@
1
+ # encoding: utf-8
2
+
3
+ ##
4
+ # note:
5
+ # the factbook category/region for world is other entities (on FAQ) and oceans in page
6
+ # changed to world
7
+
8
+
9
+ module Factbook
10
+
11
+ class Codes
12
+
13
+ Code = Struct.new( :code, ## todo: add notes (country affiliation) - why? why not??
14
+ :name,
15
+ :category, ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
16
+ :region, ## e.g. Europe, Oceans, etc.
17
+ )
18
+
19
+ def self.from_csv( path )
20
+ ###
21
+ # note:
22
+ # if you use quotes - NO leading spaces allowed e.g.
23
+ # use au,"Austria",... and NOT
24
+ # au, "Austria", ...
25
+ #
26
+ # for headers - NO leading spaces allowed e.g.
27
+ # use Code,Name,Category,Region,... and NOT
28
+ # Code, Name, Category, Region, ...
29
+
30
+ rows = CSV.read( path, headers: true )
31
+
32
+ pp rows
33
+
34
+ recs = []
35
+ rows.each do |row|
36
+ pp row
37
+ rec = Code.new
38
+ rec.code = row['Code'].strip ## remove leading n trailing whitespaces
39
+ rec.name = row['Name'].strip
40
+
41
+ ## note: for now category and region are optional
42
+ rec.category = row['Category'].strip if row['Category']
43
+ rec.region = row['Region'].strip if row['Region']
44
+
45
+ pp rec
46
+ recs << rec
47
+ end
48
+
49
+ self.new( recs )
50
+ end
51
+
52
+ def initialize( codes )
53
+ @codes = codes
54
+ end
55
+
56
+ def size() @codes.size; end
57
+
58
+ def each
59
+ @codes.each {|code| yield( code ) }
60
+ end
61
+
62
+ def to_a
63
+ @codes.collect {|code| code.code } ## return array of codes
64
+ end
65
+
66
+ ## def all() self.to_a; end ## note: alias for to_a - use - why? why not??
67
+
68
+ ## "pre-defined" convenience shortcuts
69
+ def countries() category 'Countries'; end
70
+ def world() category 'World'; end
71
+ def oceans() category 'Oceans'; end
72
+ def misc() category 'Miscellaneous'; end
73
+ def others() category 'Other'; end
74
+ def dependencies() category 'Dependencies'; end
75
+ def dependencies_us() category 'Dependencies (United States)'; end
76
+ ## fix/todo: add all dependencies uk (or gb?), fr,cn,au,nz,no,dk,etc.
77
+
78
+ def europe() region 'Europe'; end
79
+ def south_asia() region 'South Asia'; end
80
+ def central_asia() region 'Central Asia'; end
81
+ def east_n_souteast_asia() region 'East & Southeast Asia'; end
82
+ def middle_east() region 'Middle East'; end
83
+ def africa() region 'Africa'; end
84
+ def north_america() region 'North America'; end
85
+ def central_america_n_caribbean() region 'Central America and Caribbean'; end
86
+ def south_america() region 'South America'; end
87
+ def australia_oceania() region 'Australia-Oceania'; end
88
+ def antartica() region 'Antarctica'; end
89
+
90
+ ## note: regions oceans and world - same as category oceans and world
91
+ ## use oceans_ii or world_ii or something ??
92
+ ## use category('World') n region('World')
93
+ ## use category('Oceans') n region('Oceans')
94
+
95
+
96
+ def category( query )
97
+ ## todo/future: allow passing in of regex too (not just string)
98
+ ## note: e.g. Dependencies (France) needs to get escaped to
99
+ ## Dependencies \(France\) etc.
100
+ filter_regex = /#{Regexp.escape(query)}/i
101
+ codes = @codes.select do |code|
102
+ code.category ? filter_regex.match( code.category ) : false ## note: allow nil for category; will fail on search
103
+ end
104
+ Codes.new( codes ) ## return new Codes obj for easy-chaining
105
+ end
106
+
107
+ def region( query )
108
+ ## todo/future: allow passing in of regex too (not just string)
109
+ filter_regex = /#{Regexp.escape(query)}/i
110
+ codes = @codes.select do |code|
111
+ code.region ? filter_regex.match( code.region ) : false ## note: allow nil for region; will fail on search
112
+ end
113
+ Codes.new( codes ) ## return new Codes obj for easy-chaining
114
+ end
115
+
116
+ end # class Codes
117
+
118
+ end # module Factbook
119
+
@@ -0,0 +1,50 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ class Comparisons
6
+
7
+ Comparison = Struct.new( :num, ### todo: use no or id or something - why? why not?
8
+ :category, ## e.g. Geography, People, Economy, etc.
9
+ :name,
10
+ )
11
+
12
+ def self.from_csv( path )
13
+
14
+ rows = CSV.read( path, headers: true )
15
+
16
+ pp rows
17
+
18
+ recs = []
19
+ rows.each do |row|
20
+ pp row
21
+ rec = Comparison.new
22
+ rec.num = row['Num'].strip.to_i ## remove leading n trailing whitespaces
23
+ rec.category = row['Category'].strip
24
+ rec.name = row['Name'].strip
25
+
26
+ pp rec
27
+ recs << rec
28
+ end
29
+
30
+ self.new( recs )
31
+ end
32
+
33
+ def initialize( comps )
34
+ @comps = comps
35
+ end
36
+
37
+ def size() @comps.size; end
38
+
39
+ def each
40
+ @comps.each {|comp| yield( comp ) }
41
+ end
42
+
43
+ def to_a
44
+ @comps.collect {|comp| comp.num } ## return array of nums -- return something else - why? why not?
45
+ end
46
+
47
+ end # class Comparisons
48
+
49
+ end # module Factbook
50
+