factbook-readers 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +5 -5
  2. data/Manifest.txt +3 -25
  3. data/README.md +11 -69
  4. data/Rakefile +3 -3
  5. data/lib/factbook-readers.rb +5 -40
  6. data/lib/factbook-readers/convert.rb +37 -0
  7. data/lib/factbook-readers/counter.rb +7 -9
  8. data/lib/factbook-readers/page.rb +41 -61
  9. data/lib/factbook-readers/page_info.rb +15 -3
  10. data/lib/factbook-readers/version.rb +2 -2
  11. data/test/helper.rb +3 -0
  12. data/test/test_counter.rb +9 -6
  13. data/test/test_download.rb +27 -0
  14. data/test/test_fields.rb +44 -27
  15. data/test/test_json.rb +4 -4
  16. data/test/test_page.rb +8 -8
  17. data/test/test_version.rb +15 -0
  18. metadata +11 -48
  19. data/data/categories.csv +0 -164
  20. data/data/codes.csv +0 -262
  21. data/data/codesxref.csv +0 -280
  22. data/data/comparisons.csv +0 -75
  23. data/lib/factbook-readers/builder.rb +0 -187
  24. data/lib/factbook-readers/builder_item.rb +0 -201
  25. data/lib/factbook-readers/builder_json.rb +0 -68
  26. data/lib/factbook-readers/codes.rb +0 -121
  27. data/lib/factbook-readers/comparisons.rb +0 -49
  28. data/lib/factbook-readers/normalize.rb +0 -42
  29. data/lib/factbook-readers/reader_json.rb +0 -50
  30. data/lib/factbook-readers/sanitizer.rb +0 -351
  31. data/lib/factbook-readers/sect.rb +0 -28
  32. data/lib/factbook-readers/subsect.rb +0 -17
  33. data/lib/factbook-readers/table.rb +0 -51
  34. data/lib/factbook-readers/utils.rb +0 -47
  35. data/lib/factbook-readers/utils_info.rb +0 -128
  36. data/test/test_builder.rb +0 -30
  37. data/test/test_codes.rb +0 -72
  38. data/test/test_comparisons.rb +0 -16
  39. data/test/test_item_builder.rb +0 -97
  40. data/test/test_json_builder.rb +0 -23
  41. data/test/test_normalize.rb +0 -21
  42. data/test/test_sanitizer.rb +0 -36
  43. data/test/test_sanitizer_regex.rb +0 -87
data/data/comparisons.csv DELETED
@@ -1,75 +0,0 @@
1
- Num,Category,Name
2
- 2147,Geography,Area
3
- 2119,People and Society,Population
4
- 2002,People and Society,Population growth rate
5
- 2054,People and Society,Birth rate
6
- 2066,People and Society,Death rate
7
- 2112,People and Society,Net migration rate
8
- 2223,People and Society,Maternal mortality rate
9
- 2091,People and Society,Infant mortality rate
10
- 2102,People and Society,Life expectancy at birth
11
- 2127,People and Society,Total fertility rate
12
- 2225,People and Society,Health expenditures
13
- 2155,People and Society,HIV/AIDS - adult prevalence rate
14
- 2156,People and Society,HIV/AIDS - people living with HIV/AIDS
15
- 2157,People and Society,HIV/AIDS - deaths
16
- 2228,People and Society,Obesity - adult prevalence rate
17
- 2224,People and Society,Children under the age of 5 years underweight
18
- 2206,People and Society,Education expenditures
19
- 2229,People and Society,"Unemployment, youth ages 15-24"
20
- 2001,Economy,GDP (purchasing power parity)
21
- 2003,Economy,GDP - real growth rate
22
- 2004,Economy,GDP - per capita (PPP)
23
- 2260,Economy,Gross national saving
24
- 2089,Economy,Industrial production growth rate
25
- 2095,Economy,Labor force
26
- 2129,Economy,Unemployment rate
27
- 2172,Economy,Distribution of family income - Gini index
28
- 2221,Economy,Taxes and other revenues
29
- 2222,Economy,Budget surplus (+) or deficit (-)
30
- 2186,Economy,Public debt
31
- 2092,Economy,Inflation rate (consumer prices)
32
- 2207,Economy,Central bank discount rate
33
- 2208,Economy,Commercial bank prime lending rate
34
- 2214,Economy,Stock of narrow money
35
- 2215,Economy,Stock of broad money
36
- 2211,Economy,Stock of domestic credit
37
- 2200,Economy,Market value of publicly traded shares
38
- 2187,Economy,Current account balance
39
- 2078,Economy,Exports
40
- 2087,Economy,Imports
41
- 2188,Economy,Reserves of foreign exchange and gold
42
- 2079,Economy,Debt - external
43
- 2198,Economy,Stock of direct foreign investment - at home
44
- 2199,Economy,Stock of direct foreign investment - abroad
45
- 2232,Energy,Electricity - production
46
- 2233,Energy,Electricity - consumption
47
- 2234,Energy,Electricity - exports
48
- 2235,Energy,Electricity - imports
49
- 2236,Energy,Electricity - installed generating capacity
50
- 2237,Energy,Electricity - from fossil fuels
51
- 2239,Energy,Electricity - from nuclear fuels
52
- 2238,Energy,Electricity - from hydroelectric plants
53
- 2240,Energy,Electricity - from other renewable sources
54
- 2241,Energy,Crude oil - production
55
- 2242,Energy,Crude oil - exports
56
- 2243,Energy,Crude oil - imports
57
- 2244,Energy,Crude oil - proved reserves
58
- 2245,Energy,Refined petroleum products - production
59
- 2246,Energy,Refined petroleum products - consumption
60
- 2247,Energy,Refined petroleum products - exports
61
- 2248,Energy,Refined petroleum products - imports
62
- 2249,Energy,Natural gas - production
63
- 2250,Energy,Natural gas - consumption
64
- 2251,Energy,Natural gas - exports
65
- 2252,Energy,Natural gas - imports
66
- 2253,Energy,Natural gas - proved reserves
67
- 2150,Communications,Telephones - fixed lines
68
- 2151,Communications,Telephones - mobile cellular
69
- 2153,Communications,Internet users
70
- 2053,Transportation,Airports
71
- 2121,Transportation,Railways
72
- 2085,Transportation,Roadways
73
- 2093,Transportation,Waterways
74
- 2108,Transportation,Merchant marine
75
- 2034,Military,Military expenditures
data/lib/factbook-readers/builder.rb DELETED
@@ -1,187 +0,0 @@
1
-
2
- module Factbook
3
-
4
- class Builder ## todo: change to HtmlBuilder or PageBuilder ???
5
- include LogUtils::Logging
6
-
7
-
8
-
9
- attr_reader :html_original, ## full "original" 1:1 page
10
- :html, ## cut-out and sanitized profile
11
- :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
12
- :info, ## page info incl. country_name, region_name, last_updated etc.
13
- :errors, ## encoding erros etc.
14
- :sects
15
-
16
-
17
- def initialize( html_original ) ## sanitize the page, mark up and split headings, then build @sects
18
- @html_original = html_original
19
- ## sanitize returns three values (in order): cut-out html, page info, errors
20
- @html, @info, @errors = Sanitizer.new.sanitize( @html_original )
21
-
22
-
23
- html_sects = if @html.empty?
24
- ## note: support "empty" pages - old format waiting for update!!!
25
- ## cannot parse for now
26
- @html_debug = ''
27
- [] ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
28
- else
29
- @html_debug = map_sects( @html )
30
- @html_debug = map_subsects( @html_debug )
31
- ## split into [[section heading, [[subsection heading, subsection body],...]],...]
32
- split_sects( @html_debug )
33
- end
34
-
35
- pp html_sects ## debug: pretty-prints the split result to stdout
36
-
37
- ## debug
38
- ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
39
-
40
-
41
- @sects = []
42
- html_sects.each do |html_sect|
43
- html_sect_head = html_sect[0] ## marker e.g. @SECTION{Economy}
44
- html_subsects = html_sect[1] ## array of [subsection heading, subsection body] pairs
45
- puts html_sect_head
46
- puts html_subsects.size
47
-
48
- ## get section title
49
- ## @SECTION{Economy} => Economy
50
- if html_sect_head =~ /@SECTION{(.+?)}/
51
- title = $1.strip
52
- puts title
53
- sect = Sect.new
54
- sect.title = title
55
- ## get subsections
56
- subsects = []
57
- html_subsects.each do |html_subsect|
58
- html_subsect_head = html_subsect[0]
59
- html_subsect_body = html_subsect[1]
60
- if html_subsect_head =~ /@SUBSECTION{(.+?)}/
61
- title = $1.strip ## note: reuses (overwrites) title; sect.title was already assigned above
62
- title = title.sub( /:\z/, '' ) # remove trailing : if present
63
- title = title.strip
64
-
65
- puts title
66
- subsect = Subsect.new
67
- subsect.title = title ## note: trailing colon (:) was already cut off above
68
- ## parse the subsection body into a data hash
69
- b = Factbook::ItemBuilder.new( html_subsect_body, title )
70
- h = b.read
71
- subsect.data = h
72
-
73
- subsects << subsect
74
- else
75
- ## warn/fix: no subsection title found (the subsection is silently skipped)
76
- end
77
- end
78
- sect.subsects = subsects
79
- @sects << sect
80
- else
81
- ## warn/fix: no section title found (the section is silently skipped)
82
- end
83
- end
84
- end
85
-
86
-
87
- H2_RE = /<h2>
88
- \s*
89
- (.+?) ## note: use non-greedy; do NOT allow tags inside for now
90
- \s*
91
- <\/h2>
92
- /xim
93
- ## flags: x = extended syntax, i = ignore case, m = dot also matches newline
94
- def map_sects( html ) ## returns a copy of html with every <h2> heading replaced by a marker
95
- ## convert section titles to "unified" marker
96
- ## e.g.
97
- ## <h2>Introduction</h2>
98
- ## becomes @SECTION{Introduction} (with two newlines before and after)
99
- html = html.gsub( H2_RE ) do |m|
100
- puts "** found section >#{$1}<:"
101
- puts " >|#{m}|<"
102
- ## the block value is the replacement text
103
- "\n\n@SECTION{#{$1}}\n\n"
104
- end
105
- html
106
- end
107
-
108
-
109
- H3_RE = /<h3>
110
- \s*
111
- (.+?) ## note: use non-greedy; allows tags inside - why? why not
112
- \s*
113
- <\/h3>
114
- /xim
115
- ## flags: x = extended syntax, i = ignore case, m = dot also matches newline
116
- def map_subsects( html ) ## returns a copy of html with every <h3> heading replaced by a marker
117
- ## convert subsection titles to "unified" marker
118
- ## e.g.
119
- ## <h3>Disputes - international:</h3>
120
- ## becomes @SUBSECTION{Disputes - international:} (with one newline before and after)
121
- html = html.gsub( H3_RE ) do |m|
122
- puts "** found subsection >#{$1}<:"
123
- puts " >|#{m}|<"
124
- ## the block value is the replacement text
125
- "\n@SUBSECTION{#{$1}}\n"
126
- end
127
- html
128
- end
129
-
130
-
131
-
132
##
# Split marker-annotated html into sections and their subsections.
#
# html - String containing @SECTION{...} / @SUBSECTION{...} markers
#        (as inserted by map_sects / map_subsects); nil is treated like ''.
#
# Returns an array of pairs
#   [[section_heading, [[subsection_heading, subsection_body],...]],...]
# Text in front of the first section marker (the prolog) is dropped.
def split_sects( html )
  split_on_marker( html, /(@SECTION\{.+?\})/ ).map do |heading, body|
    ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
    [heading, split_subsects( body )]
  end
end


##
# Split the html of one section into its subsections.
#
# html - String containing @SUBSECTION{...} markers; nil is treated like ''.
#
# Returns an array of pairs [[subsection_heading, subsection_body],...]
# Text in front of the first subsection marker (the prolog) is dropped.
def split_subsects( html )
  split_on_marker( html, /(@SUBSECTION\{.+?\})/ )
end


##
# Shared worker for split_sects / split_subsects.
#
# marker_re - Regexp that wraps the whole marker in exactly ONE capture group;
#             String#split then keeps the matched markers in the result array.
#
# Returns [[marker, text_following_marker],...]; a marker that is not followed
# by any text gets '' as its body so every pair always has two elements.
def split_on_marker( html, marker_re )
  chunks = html.to_s.split( marker_re )

  ## note: with a capture group the first chunk is ALWAYS the text in front of
  ##   the first marker (possibly ''), never a marker itself - drop it
  ##   unconditionally; checking its content mis-aligns all pairs if the
  ##   prolog happens to mention the marker text
  chunks.shift

  chunks.each_slice( 2 ).map { |heading, body| [heading, body || ''] }
end
private :split_on_marker
183
-
184
- end # class Builder
185
-
186
-
187
- end # module Factbook
data/lib/factbook-readers/builder_item.rb DELETED
@@ -1,201 +0,0 @@
1
-
2
- module Factbook
3
-
4
- class ItemBuilder ## renameto ItemReader, ItemParser - why? why not??
5
- include LogUtils::Logging
6
- include NormalizeHelper ## e.g. normalize_category
7
-
8
- def initialize( html, name ) ## html: snippet parsed by read; name: used by read in its debug output
9
- @html = html
10
- @name = name # add category/field name e.g. Area, Location, etc.
11
- end
12
-
13
-
14
-
15
- ##
16
- ## <div class="category_data subfield text">
17
- ## Portuguese (official and most widely spoken language)
18
- ##
19
- ## </div>
20
- ## <div class="category_data note">
21
- ## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
22
- ## </div>
23
-
24
-
25
- def read ## parse @html into a hash; prints debug output and calls exit 1 on unexpected markup
26
- ## return hash from html snippet
27
- doc = Nokogiri::HTML.fragment( @html )
28
-
29
- data = {}
30
-
31
- ## note:
32
- ## skip whitespace text nodes (e.g. \n\n etc); just use divs
33
- doc_children = doc.children.filter('div')
34
-
35
- puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
36
-
37
- ## handle special case for
38
- ## multiple 'grouped_subfield' first
39
- ## e.g. used in
40
- ## - Drinking water source:
41
- ## - Sanitation facility access:
42
- ## step 1: partition the divs by (substring) match on the class attribute
43
- grouped_children = []
44
- other_children = []
45
-
46
- doc_children.each do |div|
47
- if div['class'] && div['class'].index( 'grouped_subfield' )
48
- grouped_children << div
49
- else
50
- other_children << div
51
- end
52
- end
53
-
54
-
55
- ## note: only use special rule if more than one div marked grouped_
56
- if grouped_children.size > 1
57
- ## continue processing the rest as usual
58
- doc_children = other_children
59
-
60
- key = nil ## key of the current (last started) group
61
- grouped_children.each do |div|
62
- if !div.css( 'span.subfield-group').empty?
63
- # start a new group
64
- span_group = div.at( 'span.subfield-group')
65
- key = normalize_category( span_group.text.strip )
66
- span_group.replace( '' ) ## remove the group name span so it is not part of div.text below
67
-
68
- text = squish( div.text.strip )
69
- puts "new group - category_data key >#{key}<: >#{text}<"
70
- data[ key ] = { 'text' => text }
71
- else
72
- ## append to (last) group
73
- text = squish( div.text.strip )
74
- puts "add group - category_data key >#{key}<: >#{text}<"
75
- data[ key ]['text'] += " / #{text}" ## NOTE(review): key is still nil here if the first grouped div has no span.subfield-group; data[nil] is then nil - confirm this cannot happen
76
- end
77
- end
78
- end
79
-
80
- ## step 2: regular processing; note: i is the index in the (possibly reduced) doc_children list
81
- doc_children.each_with_index do |div,i|
82
- if div['class'] && div['class'].index( 'category_data' )
83
- if div['class'].index( 'note' )
84
- text = squish( div.text.strip )
85
- puts "category_data: >#{text}<"
86
-
87
- ## note: for now only allow one note per subsection/field data block
88
- if data['note']
89
- puts "!! ERROR: note already taken:"
90
- puts data['note']
91
- puts div.to_html
92
- exit 1
93
- end
94
-
95
- ## note: add note directly (that is, W/O extra hash and text node/key)
96
- data['note'] = text
97
- elsif div['class'].index( 'historic' )
98
- ## add all historic together into one for now
99
- text = squish( div.text.strip )
100
- puts "category_data: >#{text}<"
101
-
102
- if data['text']
103
- ## append with / for now
104
- data['text'] += " / #{text}"
105
- else
106
- data['text'] = text
107
- ## check if history is first node
108
- if i != 0
109
- puts "!! ERROR: expected first historic node to be first node but it is #{i+1}:"
110
- puts div.to_html
111
- exit 1
112
- end
113
- end
114
- elsif div.css( 'span.subfield-name').empty?
115
- ## assume "implied text field"
116
- ## check for index == 1 / child count == 1 - why? why not
117
- text = squish( div.text.strip ) ## fix/todo: use strip
118
- puts "category_data: >#{text}<"
119
-
120
- data['text'] = text
121
-
122
- ## must be always first node for now
123
- if i != 0
124
- puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
125
- puts div.to_html
126
- exit 1
127
- end
128
- elsif div['class'].index( 'grouped_subfield' )
129
- ## split grouped subfield!!
130
- ## <span class="subfield-name">arable land:</span>
131
- ## <span class="subfield-number">8.6%</span>
132
- ## <span class="subfield-date">(2011 est.)</span>
133
- ## /
134
- ## <span class="subfield-name">permanent crops:</span>
135
- ## <span class="subfield-number">0.8%</span>
136
- ## <span class="subfield-date">(2011 est.)</span>
137
- ## /
138
- ## <span class="subfield-name">permanent pasture:</span>
139
- ## <span class="subfield-number">23.5%</span>
140
- ## <span class="subfield-date">(2011 est.)</span>
141
-
142
- ## join names for now - why? why not?
143
- ## e.g. becomes:
144
- ## arable land / permanent crops / permanent pasture: for key ??
145
- span_names = div.css( 'span.subfield-name')
146
- keys = []
147
- span_names.each do |span|
148
- keys << normalize_category( span.text.strip )
149
- span.replace( '' ) ## remove the name span so it is not part of div.text below
150
- end
151
- key = keys.join( ' / ')
152
- text = squish( div.text.strip )
153
- puts "category_data key >#{key}<: >#{text}<"
154
- data[ key ] = { 'text' => text }
155
- else
156
- ## get subfield name
157
- span_names = div.css( 'span.subfield-name')
158
- if span_names.size > 1
159
- puts "!! ERROR - found more than one subfield-name:"
160
- puts div.to_html
161
- exit 1
162
- end
163
- key = normalize_category( span_names[0].text.strip )
164
- span_names[0].replace( '' ) ## remove the name span so it is not part of div.text below
165
-
166
- text = squish( div.text.strip )
167
- puts "category_data key >#{key}<: >#{text}<"
168
- data[ key ] = { 'text' => text }
169
- end
170
- else
171
- text = squish( div.text.strip ) ## div W/O category_data class: only the country comparison rank is accepted
172
- if text =~ /country\s+
173
- comparison\s+
174
- to\s+
175
- the\s+
176
- world:\s+
177
- ([0-9]+)/xim
178
- data[ 'country comparison to the world' ] = $1.to_i
179
- else
180
- puts "!! ERROR: div (W/O category_data class):"
181
- puts div.to_html
182
- exit 1
183
- end
184
- end
185
- end
186
-
187
-
188
- pp data ## debug: pretty-prints the result to stdout
189
- data
190
- end
191
-
192
-
193
-
194
-
195
##
# Collapse every run of whitespace (space, tab, newline, carriage return)
# into ONE space.
#
# note: a run of length one counts too - a lone newline, tab or carriage
#   return is replaced by a space; otherwise line breaks from the html source
#   leak into the (single-line) text values
#
# str - String to clean up; leading/trailing whitespace is NOT stripped
#       (callers pass in already stripped text)
#
# Returns a new String
def squish( str )
  str.gsub( /[ \t\n\r]+/, ' ' )
end
198
-
199
- end # class ItemBuilder
200
-
201
- end # module Factbook