factbook-readers 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +5 -5
  2. data/Manifest.txt +3 -25
  3. data/README.md +11 -69
  4. data/Rakefile +3 -3
  5. data/lib/factbook-readers.rb +5 -40
  6. data/lib/factbook-readers/convert.rb +37 -0
  7. data/lib/factbook-readers/counter.rb +7 -9
  8. data/lib/factbook-readers/page.rb +41 -61
  9. data/lib/factbook-readers/page_info.rb +15 -3
  10. data/lib/factbook-readers/version.rb +2 -2
  11. data/test/helper.rb +3 -0
  12. data/test/test_counter.rb +9 -6
  13. data/test/test_download.rb +27 -0
  14. data/test/test_fields.rb +44 -27
  15. data/test/test_json.rb +4 -4
  16. data/test/test_page.rb +8 -8
  17. data/test/test_version.rb +15 -0
  18. metadata +11 -48
  19. data/data/categories.csv +0 -164
  20. data/data/codes.csv +0 -262
  21. data/data/codesxref.csv +0 -280
  22. data/data/comparisons.csv +0 -75
  23. data/lib/factbook-readers/builder.rb +0 -187
  24. data/lib/factbook-readers/builder_item.rb +0 -201
  25. data/lib/factbook-readers/builder_json.rb +0 -68
  26. data/lib/factbook-readers/codes.rb +0 -121
  27. data/lib/factbook-readers/comparisons.rb +0 -49
  28. data/lib/factbook-readers/normalize.rb +0 -42
  29. data/lib/factbook-readers/reader_json.rb +0 -50
  30. data/lib/factbook-readers/sanitizer.rb +0 -351
  31. data/lib/factbook-readers/sect.rb +0 -28
  32. data/lib/factbook-readers/subsect.rb +0 -17
  33. data/lib/factbook-readers/table.rb +0 -51
  34. data/lib/factbook-readers/utils.rb +0 -47
  35. data/lib/factbook-readers/utils_info.rb +0 -128
  36. data/test/test_builder.rb +0 -30
  37. data/test/test_codes.rb +0 -72
  38. data/test/test_comparisons.rb +0 -16
  39. data/test/test_item_builder.rb +0 -97
  40. data/test/test_json_builder.rb +0 -23
  41. data/test/test_normalize.rb +0 -21
  42. data/test/test_sanitizer.rb +0 -36
  43. data/test/test_sanitizer_regex.rb +0 -87
data/data/comparisons.csv DELETED
@@ -1,75 +0,0 @@
1
- Num,Category,Name
2
- 2147,Geography,Area
3
- 2119,People and Society,Population
4
- 2002,People and Society,Population growth rate
5
- 2054,People and Society,Birth rate
6
- 2066,People and Society,Death rate
7
- 2112,People and Society,Net migration rate
8
- 2223,People and Society,Maternal mortality rate
9
- 2091,People and Society,Infant mortality rate
10
- 2102,People and Society,Life expectancy at birth
11
- 2127,People and Society,Total fertility rate
12
- 2225,People and Society,Health expenditures
13
- 2155,People and Society,HIV/AIDS - adult prevalence rate
14
- 2156,People and Society,HIV/AIDS - people living with HIV/AIDS
15
- 2157,People and Society,HIV/AIDS - deaths
16
- 2228,People and Society,Obesity - adult prevalence rate
17
- 2224,People and Society,Children under the age of 5 years underweight
18
- 2206,People and Society,Education expenditures
19
- 2229,People and Society,"Unemployment, youth ages 15-24"
20
- 2001,Economy,GDP (purchasing power parity)
21
- 2003,Economy,GDP - real growth rate
22
- 2004,Economy,GDP - per capita (PPP)
23
- 2260,Economy,Gross national saving
24
- 2089,Economy,Industrial production growth rate
25
- 2095,Economy,Labor force
26
- 2129,Economy,Unemployment rate
27
- 2172,Economy,Distribution of family income - Gini index
28
- 2221,Economy,Taxes and other revenues
29
- 2222,Economy,Budget surplus (+) or deficit (-)
30
- 2186,Economy,Public debt
31
- 2092,Economy,Inflation rate (consumer prices)
32
- 2207,Economy,Central bank discount rate
33
- 2208,Economy,Commercial bank prime lending rate
34
- 2214,Economy,Stock of narrow money
35
- 2215,Economy,Stock of broad money
36
- 2211,Economy,Stock of domestic credit
37
- 2200,Economy,Market value of publicly traded shares
38
- 2187,Economy,Current account balance
39
- 2078,Economy,Exports
40
- 2087,Economy,Imports
41
- 2188,Economy,Reserves of foreign exchange and gold
42
- 2079,Economy,Debt - external
43
- 2198,Economy,Stock of direct foreign investment - at home
44
- 2199,Economy,Stock of direct foreign investment - abroad
45
- 2232,Energy,Electricity - production
46
- 2233,Energy,Electricity - consumption
47
- 2234,Energy,Electricity - exports
48
- 2235,Energy,Electricity - imports
49
- 2236,Energy,Electricity - installed generating capacity
50
- 2237,Energy,Electricity - from fossil fuels
51
- 2239,Energy,Electricity - from nuclear fuels
52
- 2238,Energy,Electricity - from hydroelectric plants
53
- 2240,Energy,Electricity - from other renewable sources
54
- 2241,Energy,Crude oil - production
55
- 2242,Energy,Crude oil - exports
56
- 2243,Energy,Crude oil - imports
57
- 2244,Energy,Crude oil - proved reserves
58
- 2245,Energy,Refined petroleum products - production
59
- 2246,Energy,Refined petroleum products - consumption
60
- 2247,Energy,Refined petroleum products - exports
61
- 2248,Energy,Refined petroleum products - imports
62
- 2249,Energy,Natural gas - production
63
- 2250,Energy,Natural gas - consumption
64
- 2251,Energy,Natural gas - exports
65
- 2252,Energy,Natural gas - imports
66
- 2253,Energy,Natural gas - proved reserves
67
- 2150,Communications,Telephones - fixed lines
68
- 2151,Communications,Telephones - mobile cellular
69
- 2153,Communications,Internet users
70
- 2053,Transportation,Airports
71
- 2121,Transportation,Railways
72
- 2085,Transportation,Roadways
73
- 2093,Transportation,Waterways
74
- 2108,Transportation,Merchant marine
75
- 2034,Military,Military expenditures
@@ -1,187 +0,0 @@
1
-
2
- module Factbook
3
-
4
- class Builder ## todo: change to HtmlBuilder or PageBuilder ???
5
- include LogUtils::Logging
6
-
7
-
8
-
9
- attr_reader :html_original, ## full "original" 1:1 page
10
- :html, ## cut-out and sanitized profile
11
- :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
12
- :info, ## page info incl. country_name, region_name, last_updated etc.
13
- :errors, ## encoding erros etc.
14
- :sects
15
-
16
-
17
- def initialize( html_original )
18
- @html_original = html_original
19
-
20
- @html, @info, @errors = Sanitizer.new.sanitize( @html_original )
21
-
22
-
23
- html_sects = if @html.empty?
24
- ## note: support "empty" pages - old format waiting for update!!!
25
- ## cannot parse for now
26
- @html_debug = ''
27
- [] ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
28
- else
29
- @html_debug = map_sects( @html )
30
- @html_debug = map_subsects( @html_debug )
31
-
32
- split_sects( @html_debug )
33
- end
34
-
35
- pp html_sects
36
-
37
- ## debug
38
- ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
39
-
40
-
41
- @sects = []
42
- html_sects.each do |html_sect|
43
- html_sect_head = html_sect[0]
44
- html_subsects = html_sect[1]
45
- puts html_sect_head
46
- puts html_subsects.size
47
-
48
- ## get section title
49
- ## @SECTION{Economy} => Economy
50
- if html_sect_head =~ /@SECTION{(.+?)}/
51
- title = $1.strip
52
- puts title
53
- sect = Sect.new
54
- sect.title = title
55
- ## get subsections
56
- subsects = []
57
- html_subsects.each do |html_subsect|
58
- html_subsect_head = html_subsect[0]
59
- html_subsect_body = html_subsect[1]
60
- if html_subsect_head =~ /@SUBSECTION{(.+?)}/
61
- title = $1.strip
62
- title = title.sub( /:\z/, '' ) # remove trailing : if present
63
- title = title.strip
64
-
65
- puts title
66
- subsect = Subsect.new
67
- subsect.title = title ## todo/fix: cut off trailing colon (:)
68
-
69
- b = Factbook::ItemBuilder.new( html_subsect_body, title )
70
- h = b.read
71
- subsect.data = h
72
-
73
- subsects << subsect
74
- else
75
- ## warn/fix: no subsection title found
76
- end
77
- end
78
- sect.subsects = subsects
79
- @sects << sect
80
- else
81
- ## warn/fix: no section title found
82
- end
83
- end
84
- end
85
-
86
-
87
- H2_RE = /<h2>
88
- \s*
89
- (.+?) ## note: use non-greedy; do NOT allow tags inside for now
90
- \s*
91
- <\/h2>
92
- /xim
93
-
94
- def map_sects( html )
95
- ## convert section titles to "unified" marker
96
- ## e.g.
97
- ## <h2>Introduction</h2>
98
-
99
- html = html.gsub( H2_RE ) do |m|
100
- puts "** found section >#{$1}<:"
101
- puts " >|#{m}|<"
102
-
103
- "\n\n@SECTION{#{$1}}\n\n"
104
- end
105
- html
106
- end
107
-
108
-
109
- H3_RE = /<h3>
110
- \s*
111
- (.+?) ## note: use non-greedy; allows tags inside - why? why not
112
- \s*
113
- <\/h3>
114
- /xim
115
-
116
- def map_subsects( html )
117
- ## convert subsection titles to "unified" marker
118
- ## e.g.
119
- ## <h3>Disputes - international:</h3>
120
-
121
- html = html.gsub( H3_RE ) do |m|
122
- puts "** found subsection >#{$1}<:"
123
- puts " >|#{m}|<"
124
-
125
- "\n@SUBSECTION{#{$1}}\n"
126
- end
127
- html
128
- end
129
-
130
-
131
-
132
- def split_sects( html )
133
- ####
134
- # split html in sections (divided by section headings)
135
- # e.g. remove optional prolog ??,
136
- ## [[heading,sect],
137
- ## [heading,sect],
138
- ## [heading,sect],...]
139
-
140
- ## note: "wrap" regex in a capture group (just one)
141
- ## String#split will include all catpure groups in the result array
142
-
143
- ## note: use non-greedy -- check: need to escape {} ??
144
- chunks = html.split( /(@SECTION{.+?})/ )
145
-
146
- ## check if first item is a section or (html) prolog
147
- # if prolog (remove)
148
- chunks.slice!(0) unless chunks[0] =~ /@SECTION/ ## starts w/ @SECTION
149
-
150
- pairs = chunks.each_slice(2).to_a
151
-
152
- ## now split subsections
153
- newpairs = []
154
- pairs.each do |item|
155
- ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
156
- newpairs << [item[0], split_subsects( item[1]) ]
157
- end
158
- newpairs
159
- end
160
-
161
-
162
- def split_subsects( html )
163
- ####
164
- # split html in subsections (divided by subsection headings)
165
- # e.g. remove optional prolog ??,
166
- ## [[heading,sect],
167
- ## [heading,sect],
168
- ## [heading,sect],...]
169
-
170
- ## note: "wrap" regex in a capture group (just one)
171
- ## String#split will include all catpure groups in the result array
172
-
173
- ## note: use non-greedy -- check: need to escape {} ??
174
- chunks = html.split( /(@SUBSECTION{.+?})/ )
175
-
176
- ## check if first item is a section or (html) prolog
177
- # if prolog (remove)
178
- chunks.slice!(0) unless chunks[0] =~ /@SUBSECTION/ ## starts w/ @SUBSECTION
179
-
180
- pairs = chunks.each_slice(2).to_a
181
- pairs
182
- end
183
-
184
- end # class Builder
185
-
186
-
187
- end # module Factbook
@@ -1,201 +0,0 @@
1
-
2
- module Factbook
3
-
4
- class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
5
- include LogUtils::Logging
6
- include NormalizeHelper ## e.g. normalize_category
7
-
8
- def initialize( html, name )
9
- @html = html
10
- @name = name # add category/field name e.g. Area, Location, etc.
11
- end
12
-
13
-
14
-
15
- ##
16
- ## <div class="category_data subfield text">
17
- ## Portuguese (official and most widely spoken language)
18
- ##
19
- ## </div>
20
- ## <div class="category_data note">
21
- ## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
22
- ## </div>
23
-
24
-
25
- def read
26
- ## return hash from html snippet
27
- doc = Nokogiri::HTML.fragment( @html )
28
-
29
- data = {}
30
-
31
- ## note:
32
- ## skip whitespace text nodes (e.g. \n\n etc); just use divs
33
- doc_children = doc.children.filter('div')
34
-
35
- puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
36
-
37
- ## hanlde special case for
38
- ## multiple 'grouped_subfield' first
39
- ## e.g. used in
40
- ## - Drinking water source:
41
- ## - Sanitation facility access:
42
-
43
- grouped_children = []
44
- other_children = []
45
-
46
- doc_children.each do |div|
47
- if div['class'] && div['class'].index( 'grouped_subfield' )
48
- grouped_children << div
49
- else
50
- other_children << div
51
- end
52
- end
53
-
54
-
55
- ## note: only use special rule if more than one div marked grouped_
56
- if grouped_children.size > 1
57
- ## continue processing the rest as usual
58
- doc_children = other_children
59
-
60
- key = nil
61
- grouped_children.each do |div|
62
- if !div.css( 'span.subfield-group').empty?
63
- # start a new group
64
- span_group = div.at( 'span.subfield-group')
65
- key = normalize_category( span_group.text.strip )
66
- span_group.replace( '' )
67
-
68
- text = squish( div.text.strip )
69
- puts "new group - category_data key >#{key}<: >#{text}<"
70
- data[ key ] = { 'text' => text }
71
- else
72
- ## append to (last) group
73
- text = squish( div.text.strip )
74
- puts "add group - category_data key >#{key}<: >#{text}<"
75
- data[ key ]['text'] += " / #{text}"
76
- end
77
- end
78
- end
79
-
80
-
81
- doc_children.each_with_index do |div,i|
82
- if div['class'] && div['class'].index( 'category_data' )
83
- if div['class'].index( 'note' )
84
- text = squish( div.text.strip )
85
- puts "category_data: >#{text}<"
86
-
87
- ## note: for now only allow one note per subsection/field data block
88
- if data['note']
89
- puts "!! ERROR: note already taken:"
90
- puts data['note']
91
- puts div.to_html
92
- exit 1
93
- end
94
-
95
- ## note: add note directly (that is, W/O extra hash and text node/key)
96
- data['note'] = text
97
- elsif div['class'].index( 'historic' )
98
- ## add all historic together into one for now
99
- text = squish( div.text.strip )
100
- puts "category_data: >#{text}<"
101
-
102
- if data['text']
103
- ## append with / for now
104
- data['text'] += " / #{text}"
105
- else
106
- data['text'] = text
107
- ## check if history is first node
108
- if i != 0
109
- puts "!! ERROR: expected first historic node to be first node but it is #{i+1}:"
110
- puts div.to_html
111
- exit 1
112
- end
113
- end
114
- elsif div.css( 'span.subfield-name').empty?
115
- ## assume "implied text field"
116
- ## check for index == 1 / child count == 1 - why? why not
117
- text = squish( div.text.strip ) ## fix/todo: use strip
118
- puts "category_data: >#{text}<"
119
-
120
- data['text'] = text
121
-
122
- ## must be always first node for now
123
- if i != 0
124
- puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
125
- puts div.to_html
126
- exit 1
127
- end
128
- elsif div['class'].index( 'grouped_subfield' )
129
- ## split grouped subfield!!
130
- ## <span class="subfield-name">arable land:</span>
131
- ## <span class="subfield-number">8.6%</span>
132
- ## <span class="subfield-date">(2011 est.)</span>
133
- ## /
134
- ## <span class="subfield-name">permanent crops:</span>
135
- ## <span class="subfield-number">0.8%</span>
136
- ## <span class="subfield-date">(2011 est.)</span>
137
- ## /
138
- ## <span class="subfield-name">permanent pasture:</span>
139
- ## <span class="subfield-number">23.5%</span>
140
- ## <span class="subfield-date">(2011 est.)</span>
141
-
142
- ## join names for now - why? why not?
143
- ## e.g. becomes:
144
- ## arable land / permanent crops / permanent pasture: for key ??
145
- span_names = div.css( 'span.subfield-name')
146
- keys = []
147
- span_names.each do |span|
148
- keys << normalize_category( span.text.strip )
149
- span.replace( '' )
150
- end
151
- key = keys.join( ' / ')
152
- text = squish( div.text.strip )
153
- puts "category_data key >#{key}<: >#{text}<"
154
- data[ key ] = { 'text' => text }
155
- else
156
- ## get subfield name
157
- span_names = div.css( 'span.subfield-name')
158
- if span_names.size > 1
159
- puts "!! ERROR - found more than one subfield-name:"
160
- puts div.to_html
161
- exit 1
162
- end
163
- key = normalize_category( span_names[0].text.strip )
164
- span_names[0].replace( '' )
165
-
166
- text = squish( div.text.strip )
167
- puts "category_data key >#{key}<: >#{text}<"
168
- data[ key ] = { 'text' => text }
169
- end
170
- else
171
- text = squish( div.text.strip )
172
- if text =~ /country\s+
173
- comparison\s+
174
- to\s+
175
- the\s+
176
- world:\s+
177
- ([0-9]+)/xim
178
- data[ 'country comparison to the world' ] = $1.to_i
179
- else
180
- puts "!! ERROR: div (W/O category_data class):"
181
- puts div.to_html
182
- exit 1
183
- end
184
- end
185
- end
186
-
187
-
188
- pp data
189
- data
190
- end
191
-
192
-
193
-
194
-
195
- def squish( str )
196
- str.gsub( /[ \t\n\r]{2,}/, ' ') ## replace multi-spaces (incl. newlines with once space)
197
- end
198
-
199
- end # class ItemBuilder
200
-
201
- end # module Factbook