factbook-readers 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d9bc3eaf2cb6fa3774e0b7a25b53336be2b05a55
4
- data.tar.gz: 86565dc92913645110beec803d7bc0a7c088155f
3
+ metadata.gz: 2bc67eb2f60367d8d0ef00ca718c7d8b81b4a9c8
4
+ data.tar.gz: f61389d6a073db31e79766c2711bbabb89b27699
5
5
  SHA512:
6
- metadata.gz: 755b8727d0bbcaecd97f52064b1b29321e1b59a72bff55bbdd995ed8968732def7480f4cab0f222bf4c9d163afbd5230647237b96d41dc136006f0f9a9473550
7
- data.tar.gz: 638dcf4f4a552c705a743c9e7483e457303d2090a9204ee3d4b390c3256b537050ea4fdc0957cba461dcee549ecf52b541c4b691dcb8c500c4439eaf376d4a87
6
+ metadata.gz: 3a565e36afae190e18154bc366bbd3d1a77f06e0c51017ba34893448fd2588fa57e2c93647ef3de5338b423e63787ef248a56c512b67504a522122bb4b24e0ff
7
+ data.tar.gz: 1cd6b487cb5fb2a2c5b659d2dacf0481ff5368f2e85f977c145bcf46e94f16d0543bbb59dd61995fb1b137c3bb4654308119c89b6ffa7883b0023684101b17dc
data/README.md CHANGED
@@ -33,20 +33,18 @@ resulting in:
33
33
  {"total"=>{"text"=>"8,515,770 sq km"},
34
34
  "land"=>{"text"=>"8,358,140 sq km"},
35
35
  "water"=>{"text"=>"157,630 sq km"},
36
- "note"=>
37
- {"text"=>
38
- "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."}},
36
+ "note"=> "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."},
39
37
  "Area - comparative"=>
40
38
  {"text"=>"slightly smaller than the US"},
41
39
  "Land boundaries"=>
42
40
  {"total"=>{"text"=>"16,145 km"},
43
- "border countries (10)"=>
41
+ "border countries"=>
44
42
  {"text"=>
45
43
  "Argentina 1,263 km, Bolivia 3,403 km, Colombia 1,790 km,
46
44
  French Guiana 649 km, Guyana 1,308 km, Paraguay 1,371 km, Peru 2,659 km,
47
45
  Suriname 515 km, Uruguay 1,050 km, Venezuela 2,137 km"}},
48
46
  "Climate"=>{"text"=>"mostly tropical, but temperate in south"},
49
- "Elevation extremes"=>
47
+ "Elevation"=>
50
48
  {"lowest point"=>{"text"=>"Atlantic Ocean 0 m"},
51
49
  "highest point"=>{"text"=>"Pico da Neblina 2,994 m"}},
52
50
  "Natural resources"=>
@@ -66,7 +64,7 @@ pp page['Geography']['Area']['land']['text']
66
64
  # => "8,358,140 sq km"
67
65
  pp page['Geography']['Area']['water']['text']
68
66
  # => "157,630 sq km"
69
- pp page['Geography']['Area']['note']['text']
67
+ pp page['Geography']['Area']['note']
70
68
  # => "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."
71
69
  pp page['Geography']['Area - comparative']['text']
72
70
  # => "slightly smaller than the US"
@@ -74,9 +72,9 @@ pp page['Geography']['Climate']['text']
74
72
  # => "mostly tropical, but temperate in south"
75
73
  pp page['Geography']['Terrain']['text']
76
74
  # => "mostly flat to rolling lowlands in north; ..."
77
- pp page['Geography']['Elevation extremes']['lowest point']['text']
75
+ pp page['Geography']['Elevation']['lowest point']['text']
78
76
  # => "Atlantic Ocean 0 m"
79
- pp page['Geography']['Elevation extremes']['highest point']['text']
77
+ pp page['Geography']['Elevation']['highest point']['text']
80
78
  # => "Pico da Neblina 2,994 m"
81
79
  pp page['Geography']['Natural resources']['text']
82
80
  # => "bauxite, gold, iron ore, manganese, nickel, phosphates, ..."
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -45,7 +44,7 @@ def read
45
44
  other_children = []
46
45
 
47
46
  doc_children.each do |div|
48
- if div['class'].index( 'grouped_subfield' )
47
+ if div['class'] && div['class'].index( 'grouped_subfield' )
49
48
  grouped_children << div
50
49
  else
51
50
  other_children << div
@@ -79,7 +78,8 @@ def read
79
78
  end
80
79
 
81
80
 
82
- doc_children.each_with_index do |div,i|
81
+ doc_children.each_with_index do |div,i|
82
+ if div['class'] && div['class'].index( 'category_data' )
83
83
  if div['class'].index( 'note' )
84
84
  text = squish( div.text.strip )
85
85
  puts "category_data: >#{text}<"
@@ -92,7 +92,8 @@ def read
92
92
  exit 1
93
93
  end
94
94
 
95
- data['note'] = { 'text' => text }
95
+ ## note: add note directly (that is, W/O extra hash and text node/key)
96
+ data['note'] = text
96
97
  elsif div['class'].index( 'historic' )
97
98
  ## add all historic together into one for now
98
99
  text = squish( div.text.strip )
@@ -166,7 +167,22 @@ def read
166
167
  puts "category_data key >#{key}<: >#{text}<"
167
168
  data[ key ] = { 'text' => text }
168
169
  end
170
+ else
171
+ text = squish( div.text.strip )
172
+ if text =~ /country\s+
173
+ comparison\s+
174
+ to\s+
175
+ the\s+
176
+ world:\s+
177
+ ([0-9]+)/xim
178
+ data[ 'country comparison to the world' ] = $1.to_i
179
+ else
180
+ puts "!! ERROR: div (W/O category_data class):"
181
+ puts div.to_html
182
+ exit 1
183
+ end
169
184
  end
185
+ end
170
186
 
171
187
 
172
188
  pp data
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  ##
4
2
  # note:
5
3
  # the factbook category/region for world is other entities (on FAQ) and oceans in page
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -24,20 +23,20 @@ def walk( page, hin, hout )
24
23
  hin.each do |k,v|
25
24
  if v.is_a? Hash
26
25
  hout2 = hout[k] || { count: 0, codes: '' }
27
-
26
+
28
27
  hout2[ :count ] += 1
29
-
28
+
30
29
  ## delete codes if larger (threshold) than x (e.g. 9)
31
30
  hout2.delete( :codes ) if hout2[ :count ] > 9
32
31
 
33
32
  codes = hout2[ :codes ]
34
33
  if codes ## note: might got deleted if passed treshhold (e.g. 9 entries)
35
34
  codes << ' ' unless codes.empty? ## add separator (space for now)
36
- codes << page.info.country_code
35
+ codes << page.info.country_code
37
36
  hout2[ :codes ] = codes
38
37
  end
39
-
40
- hout[k] = hout2
38
+
39
+ hout[k] = hout2
41
40
  walk( page, v, hout2 )
42
41
  end
43
42
  end
@@ -1,6 +1,5 @@
1
- # encoding: utf-8
2
1
 
3
- module Factbook
2
+ module Factbook
4
3
  module NormalizeHelper
5
4
 
6
5
 
@@ -17,7 +16,7 @@ def normalize_category( text )
17
16
 
18
17
  ## typos e.g ntoe => use note
19
18
  text = 'note' if text == 'ntoe'
20
- text = 'investment in fixed capital' if text == 'investment if fixed capital'
19
+ text = 'investment in fixed capital' if text == 'investment if fixed capital'
21
20
 
22
21
  ## downcase
23
22
  text = 'lowest point' if text == 'Lowest point'
@@ -34,7 +33,7 @@ def normalize_category( text )
34
33
 
35
34
  ## border countries (8): -- remove (x) counter
36
35
  text = 'border countries' if text.start_with?( 'border countries')
37
-
36
+
38
37
  text
39
38
  end
40
39
 
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -12,7 +11,7 @@ def read_page( code )
12
11
  path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"
13
12
 
14
13
  puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
15
- json = File.read( path )
14
+ json = File.read( path, 'r:utf-8' ) { |f| f.read }
16
15
 
17
16
  ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
18
17
  # add some page info from code struct
@@ -21,7 +20,7 @@ def read_page( code )
21
20
  info.country_code = code.code
22
21
  info.country_name = code.name
23
22
  info.region_name = code.region
24
-
23
+
25
24
  page = Page.new( code.code, json: json, info: info )
26
25
  page
27
26
  end
@@ -31,8 +30,8 @@ def read_pages( codes, limit: nil )
31
30
  i=0
32
31
  codes.each do |code|
33
32
  next if limit && i > limit ## for debugging just process first x entries
34
-
35
- pages << read_page( code )
33
+
34
+ pages << read_page( code )
36
35
  end
37
36
  pages
38
37
  end
@@ -114,19 +114,65 @@ def find_country_profile( html )
114
114
  }
115
115
  puts " #{li_children.size} div(s) in >#{section_title}<:"
116
116
 
117
+
118
+ ## check special case in world Geographic overview:
119
+ # <div class="category oce_light" style="padding-left:5px;"
120
+ # id="field-anchor-geography-geographic-overview">
121
+ # Geographic overview:
122
+ # <span class="field-listing-link">
123
+ # <a href="../fields/275.html#XX">
124
+ # <img alt="Geographic overview field listing"
125
+ # title="Geographic overview field listing"
126
+ # src="../images/field_listing.gif" /></a>
127
+ # </span>
128
+ #</div>
129
+ # vs regular
130
+ #
131
+ # <div class="category oce_light" style="padding-left:5px;"
132
+ # id="field-anchor-geography-area-comparative">
133
+ # <span class="btn-tooltip definition" role="tooltip" aria-hidden='true'>
134
+ # <a aria-label="Use this link to access a description of the Area - comparative field"
135
+ # href="../docs/notesanddefs.html#280">
136
+ # Area - comparative
137
+ # </a>:
138
+ # <span class="tooltip-content">
139
+ # This entry provides an area comparison based on total area equivalents. Most entities are compared with the entire US or one of the 50 states based on area measurements (1990 revised) provided by the US Bureau of the Census. The smaller entities are compared with Washington, DC (178 sq km, 69 sq mi) or The Mall in Washington, DC (0.59 sq km, 0.23 sq mi, 146 acres).
140
+ # </span>
141
+ # </span>
142
+ # <span class="field-listing-link">
143
+ # <a href="../fields/280.html#XX"><img alt="Area - comparative field listing" title="Area - comparative field listing" src="../images/field_listing.gif" /></a>
144
+ # </span>
145
+ # </div>
146
+
117
147
  li_children.each_slice(2) do |divs|
118
148
  div = divs[0]
119
- a = div.css('a')[0]
120
149
 
121
- if a
122
- subsection_title = a.text ## todo/check/rename: use field_name or such - why? why not?
123
- html << "\n<h3>#{subsection_title}:</h3>\n"
124
- else
125
- subsection_title = '???'
126
- puts "!! WARN: no anchor found:"
127
- puts div.to_html
150
+ ## try new way - try clean-up / rm first
151
+ span_tooltip_content = div.at( 'span.tooltip-content' )
152
+ if span_tooltip_content
153
+ span_tooltip_content.inner_html = ''
154
+ span_tooltip_content.replace( '' ) ## check for how to delete/remove - why? why not!!
128
155
  end
129
156
 
157
+ span_field_listing_link = div.at( 'span.field-listing-link' )
158
+ if span_field_listing_link
159
+ span_field_listing_link.inner_html = ''
160
+ span_field_listing_link.replace( '' )
161
+ end
162
+
163
+ subsection_title = div.text.strip
164
+ html << "\n<h3>#{subsection_title}</h3>\n"
165
+
166
+ # a = div.css('a')[0]
167
+ # if a
168
+ # subsection_title = a.text ## todo/check/rename: use field_name or such - why? why not?
169
+ # html << "\n<h3>#{subsection_title}:</h3>\n"
170
+ # else
171
+ # subsection_title = '???'
172
+ # puts "!! WARN: no anchor found:"
173
+ # puts div.to_html
174
+ # end
175
+
130
176
 
131
177
  div = divs[1]
132
178
  div_children = div.children.select {|el| el.name == 'div' ? true : false }
@@ -157,7 +203,19 @@ def find_country_profile( html )
157
203
  end
158
204
  else
159
205
  if catdiv.to_html.index( 'country comparison to the world' )
160
- ## silently skip for now country comparision
206
+ ## simplify/unlinkify country comparison
207
+ ## <div>
208
+ ## <span class='category'>country comparison to the world:</span>
209
+ ## <span class='category_data'>
210
+ ## <a href="../fields/335rank.html#AU">97</a>
211
+ ## </span>
212
+ ## </div>
213
+ ## e.g. to =>
214
+ ## <div>
215
+ ## country comparison to the world: 97
216
+ ## </div>
217
+ html << "<div>\n #{squish( catdiv.text.strip )}\n</div>"
218
+ html << "\n"
161
219
  else
162
220
  puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
163
221
  puts catdiv.to_html
@@ -229,6 +287,9 @@ def sanitize_data( el, title: )
229
287
  ## see fr (france) in political parties section for example
230
288
  ## todo/check/fix: check if we need to use unicode char!! and NOT html entity
231
289
  inner_html = inner_html.gsub( "&nbsp;", ' ' )
290
+ ## Unicode Character 'NO-BREAK SPACE' (U+00A0)
291
+ inner_html = inner_html.gsub( "\u00A0", ' ' ) ## use unicode char
292
+
232
293
 
233
294
  el.inner_html = inner_html.rstrip + "\n"
234
295
 
@@ -272,13 +333,17 @@ def sanitize_data( el, title: )
272
333
  #####
273
334
  # "unfancy" smart quotes to ascii - why? why not?
274
335
  # e.g.
275
- # Following Britain’s victory => Following Britain's victory
336
+ # Following Britain’s victory => Following Britain's victory
276
337
  html = html.tr( "’", "'" )
277
-
338
+ # “full floor” House vote => "full floor" House vote
339
+ html = html.tr( "“”", '""' )
278
340
 
279
341
  html
280
342
  end
281
343
 
344
+ def squish( str )
345
+ str.gsub( /[ \t\n\r]{2,}/, ' ' ) ## replace runs of two or more whitespace chars (space, tab, newline, carriage return) with one space
346
+ end
282
347
 
283
348
 
284
349
  end # class Sanitizer
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -6,9 +5,9 @@ module Factbook
6
5
  class Sect
7
6
  include LogUtils::Logging
8
7
 
9
- attr_accessor :title ## use name instead of title - why? why not?
8
+ attr_accessor :title ## use name instead of title - why? why not?
10
9
  attr_accessor :subsects
11
-
10
+
12
11
  def initialize
13
12
  @subsects = []
14
13
  end
@@ -16,7 +15,7 @@ class Sect
16
15
  def data
17
16
  ## convert sects to hash
18
17
  @data = {}
19
-
18
+
20
19
  subsects.each_with_index do |subsect,i|
21
20
  @data[ subsect.title ] = subsect.data
22
21
  end
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
@@ -9,12 +8,12 @@ module Factbook
9
8
  ##
10
9
  ## for now reads in rows with values separated by three or more spaces e.g.:
11
10
  ## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
12
- ## 1 China 1,367,485,388
13
- ## 2 India 1,251,695,584
14
- ## 3 European Union 513,949,445
15
- ## 4 United States 321,368,864
16
- ## 5 Indonesia 255,993,674
17
- ## 6 Brazil 204,259,812
11
+ ## 1 China 1,367,485,388
12
+ ## 2 India 1,251,695,584
13
+ ## 3 European Union 513,949,445
14
+ ## 4 United States 321,368,864
15
+ ## 5 Indonesia 255,993,674
16
+ ## 6 Brazil 204,259,812
18
17
 
19
18
 
20
19
  class TableReader
@@ -38,7 +37,7 @@ def read
38
37
  end
39
38
 
40
39
  values = line.split( /[ ]{3,}/ ) ## split three or more spaces - use just two ?? why? why not??
41
-
40
+
42
41
  ## puts line
43
42
  ## pp values
44
43
  recs << values
@@ -1,4 +1,3 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
  module Utils
@@ -4,7 +4,7 @@ module Module
4
4
  module Readers
5
5
  MAJOR = 1
6
6
  MINOR = 0
7
- PATCH = 0
7
+ PATCH = 1
8
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
9
9
 
10
10
  def self.version
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: factbook-readers
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gerald Bauer