RubyGems - factbook-readers - Versions diffs - 1.0.0 → 1.0.1 - Mend

factbook-readers 1.0.0 → 1.0.1

Files changed (16) hide show

checksums.yaml +4 -4
data/README.md +6 -8
data/lib/factbook-readers/builder_item.rb +20 -4
data/lib/factbook-readers/codes.rb +0 -2
data/lib/factbook-readers/comparisons.rb +0 -1
data/lib/factbook-readers/counter.rb +5 -6
data/lib/factbook-readers/normalize.rb +3 -4
data/lib/factbook-readers/page_info.rb +0 -1
data/lib/factbook-readers/reader_json.rb +4 -5
data/lib/factbook-readers/sanitizer.rb +76 -11
data/lib/factbook-readers/sect.rb +3 -4
data/lib/factbook-readers/subsect.rb +0 -1
data/lib/factbook-readers/table.rb +7 -8
data/lib/factbook-readers/utils_info.rb +0 -1
data/lib/factbook-readers/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d9bc3eaf2cb6fa3774e0b7a25b53336be2b05a55
-  data.tar.gz: 86565dc92913645110beec803d7bc0a7c088155f
+  metadata.gz: 2bc67eb2f60367d8d0ef00ca718c7d8b81b4a9c8
+  data.tar.gz: f61389d6a073db31e79766c2711bbabb89b27699
 SHA512:
-  metadata.gz: 755b8727d0bbcaecd97f52064b1b29321e1b59a72bff55bbdd995ed8968732def7480f4cab0f222bf4c9d163afbd5230647237b96d41dc136006f0f9a9473550
-  data.tar.gz: 638dcf4f4a552c705a743c9e7483e457303d2090a9204ee3d4b390c3256b537050ea4fdc0957cba461dcee549ecf52b541c4b691dcb8c500c4439eaf376d4a87
+  metadata.gz: 3a565e36afae190e18154bc366bbd3d1a77f06e0c51017ba34893448fd2588fa57e2c93647ef3de5338b423e63787ef248a56c512b67504a522122bb4b24e0ff
+  data.tar.gz: 1cd6b487cb5fb2a2c5b659d2dacf0481ff5368f2e85f977c145bcf46e94f16d0543bbb59dd61995fb1b137c3bb4654308119c89b6ffa7883b0023684101b17dc

data/README.md CHANGED

@@ -33,20 +33,18 @@ resulting in:
     {"total"=>{"text"=>"8,515,770 sq km"},
      "land"=>{"text"=>"8,358,140 sq km"},
      "water"=>{"text"=>"157,630 sq km"},
-     "note"=>
-      {"text"=>
-        "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."}},
+     "note"=> "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."},
    "Area - comparative"=>
     {"text"=>"slightly smaller than the US"},
    "Land boundaries"=>
     {"total"=>{"text"=>"16,145 km"},
-     "border countries (10)"=>
+     "border countries"=>
       {"text"=>
         "Argentina 1,263 km, Bolivia 3,403 km, Colombia 1,790 km,
         French Guiana 649 km, Guyana 1,308 km, Paraguay 1,371 km, Peru 2,659 km,
         Suriname 515 km, Uruguay 1,050 km, Venezuela 2,137 km"}},
    "Climate"=>{"text"=>"mostly tropical, but temperate in south"},
-   "Elevation extremes"=>
+   "Elevation"=>
     {"lowest point"=>{"text"=>"Atlantic Ocean 0 m"},
      "highest point"=>{"text"=>"Pico da Neblina 2,994 m"}},
    "Natural resources"=>
@@ -66,7 +64,7 @@ pp page['Geography']['Area']['land']['text']
 # => "8,358,140 sq km"
 pp page['Geography']['Area']['water']['text']
 # => "157,630 sq km"
-pp page['Geography']['Area']['note']['text']
+pp page['Geography']['Area']['note']
 # => "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."
 pp page['Geography']['Area - comparative']['text']
 # => "slightly smaller than the US"
@@ -74,9 +72,9 @@ pp page['Geography']['Climate']['text']
 # => "mostly tropical, but temperate in south"
 pp page['Geography']['Terrain']['text']
 # => "mostly flat to rolling lowlands in north; ..."
-pp page['Geography']['Elevation extremes']['lowest point']['text']
+pp page['Geography']['Elevation']['lowest point']['text']
 # => "Atlantic Ocean 0 m"
-pp page['Geography']['Elevation extremes']['highest point']['text']
+pp page['Geography']['Elevation']['highest point']['text']
 # => "Pico da Neblina 2,994 m"
 pp page['Geography']['Natural resources']['text']
 # => "bauxite, gold, iron ore, manganese, nickel, phosphates, ..."

data/lib/factbook-readers/builder_item.rb CHANGED

@@ -1,4 +1,3 @@
-# encoding: utf-8
 module Factbook
@@ -45,7 +44,7 @@ def read
   other_children   = []
   doc_children.each do |div|
-     if div['class'].index( 'grouped_subfield' )
+     if div['class'] && div['class'].index( 'grouped_subfield' )
         grouped_children << div
      else
         other_children << div
@@ -79,7 +78,8 @@ def read
   end
-  doc_children.each_with_index do |div,i|
+doc_children.each_with_index do |div,i|
+  if div['class'] && div['class'].index( 'category_data' )
     if div['class'].index( 'note' )
       text = squish( div.text.strip )
       puts "category_data: >#{text}<"
@@ -92,7 +92,8 @@ def read
         exit 1
       end
-      data['note'] = { 'text' => text }
+      ## note: add note directly (that is, W/O extra hash and text node/key)
+      data['note'] = text
     elsif div['class'].index( 'historic' )
       ## add all historic together into one for now
         text = squish( div.text.strip )
@@ -166,7 +167,22 @@ def read
       puts "category_data key >#{key}<: >#{text}<"
       data[ key ] = { 'text' => text }
     end
+  else
+      text = squish( div.text.strip )
+      if text =~ /country\s+
+                  comparison\s+
+                  to\s+
+                  the\s+
+                  world:\s+
+                  ([0-9]+)/xim
+        data[ 'country comparison to the world' ] = $1.to_i
+      else
+        puts "!! ERROR: div (W/O category_data class):"
+        puts div.to_html
+        exit 1
+      end
   end
+end
   pp data

data/lib/factbook-readers/codes.rb CHANGED

@@ -1,5 +1,3 @@
-# encoding: utf-8
 ##
 # note:
 #   the factbook category/region for world is other entities (on FAQ) and oceans in page

data/lib/factbook-readers/comparisons.rb CHANGED

@@ -1,4 +1,3 @@
-# encoding: utf-8
 module Factbook

data/lib/factbook-readers/counter.rb CHANGED

@@ -1,4 +1,3 @@
-# encoding: utf-8
 module Factbook
@@ -24,20 +23,20 @@ def walk( page, hin, hout )
    hin.each do |k,v|
      if v.is_a? Hash
         hout2 =  hout[k] || { count: 0, codes: '' }
         hout2[ :count ] += 1
         ## delete codes if larger (threshold) than x (e.g. 9)
         hout2.delete( :codes )    if hout2[ :count ] > 9
         codes = hout2[ :codes ]
         if codes    ## note: might have been deleted if the threshold was passed (e.g. 9 entries)
           codes << ' '  unless codes.empty?   ## add separator (space for now)
-          codes << page.info.country_code
+          codes << page.info.country_code
           hout2[ :codes ] = codes
         end
-        hout[k] = hout2
+        hout[k] = hout2
         walk( page, v, hout2 )
      end
    end

data/lib/factbook-readers/normalize.rb CHANGED

@@ -1,6 +1,5 @@
-# encoding: utf-8
-module Factbook
+module Factbook
   module NormalizeHelper
@@ -17,7 +16,7 @@ def normalize_category( text )
   ##   typos e.g ntoe => use note
   text = 'note'                         if text == 'ntoe'
-  text = 'investment in fixed capital'  if text == 'investment if fixed capital'
+  text = 'investment in fixed capital'  if text == 'investment if fixed capital'
   ##  downcase
   text = 'lowest point'    if text == 'Lowest point'
@@ -34,7 +33,7 @@ def normalize_category( text )
   ##    border countries (8):   -- remove (x) counter
   text = 'border countries'   if text.start_with?( 'border countries')
   text
 end

data/lib/factbook-readers/page_info.rb CHANGED

@@ -1,4 +1,3 @@
-# encoding: utf-8
 module Factbook

data/lib/factbook-readers/reader_json.rb CHANGED

@@ -1,4 +1,3 @@
-# encoding: utf-8
 module Factbook
@@ -12,7 +11,7 @@ def read_page( code )
   path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"
   puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
-  json = File.read( path )
+  json = File.open( path, 'r:utf-8' ) { |f| f.read }
 ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
 #    add some page info from code struct
@@ -21,7 +20,7 @@ def read_page( code )
   info.country_code = code.code
   info.country_name = code.name
   info.region_name  = code.region
   page = Page.new( code.code, json: json, info: info )
   page
 end
@@ -31,8 +30,8 @@ def read_pages( codes, limit: nil )
   i=0
   codes.each do |code|
     next if limit && i > limit   ## for debugging just process first x entries
-    pages << read_page( code )
+    pages << read_page( code )
   end
   pages
 end

data/lib/factbook-readers/sanitizer.rb CHANGED

@@ -114,19 +114,65 @@ def find_country_profile( html )
                                       }
     puts " #{li_children.size} div(s) in >#{section_title}<:"
+    ## check special case in world  Geographic overview:
+#    <div class="category oce_light" style="padding-left:5px;"
+#       id="field-anchor-geography-geographic-overview">
+#           Geographic overview:
+#       <span class="field-listing-link">
+#            <a href="../fields/275.html#XX">
+#              <img alt="Geographic overview field listing"
+#                   title="Geographic overview field listing"
+#                   src="../images/field_listing.gif" /></a>
+#         </span>
+#</div>
+# vs regular
+#
+# <div class="category oce_light" style="padding-left:5px;"
+#       id="field-anchor-geography-area-comparative">
+#      <span class="btn-tooltip definition" role="tooltip" aria-hidden='true'>
+ #       <a aria-label="Use this link to access a description of the Area - comparative field"
+ #            href="../docs/notesanddefs.html#280">
+ #            Area - comparative
+ #         </a>:
+ #       <span class="tooltip-content">
+ #           This entry provides an area comparison based on total area equivalents. Most entities are compared with the entire US or one of the 50 states based on area measurements (1990 revised) provided by the US Bureau of the Census. The smaller entities are compared with Washington, DC (178 sq km, 69 sq mi) or The Mall in Washington, DC (0.59 sq km, 0.23 sq mi, 146 acres).
+ #       </span>
+ #     </span>
+  #    <span class="field-listing-link">
+  #        <a href="../fields/280.html#XX"><img alt="Area - comparative field listing" title="Area - comparative field listing" src="../images/field_listing.gif" /></a>
+  #    </span>
+  # </div>
     li_children.each_slice(2) do |divs|
       div = divs[0]
-      a = div.css('a')[0]
-      if a
-        subsection_title = a.text   ## todo/check/rename: use field_name or such - why? why not?
-        html << "\n<h3>#{subsection_title}:</h3>\n"
-      else
-        subsection_title = '???'
-        puts "!! WARN: no anchor found:"
-        puts div.to_html
+      ## try new way - try clean-up / rm first
+      span_tooltip_content = div.at( 'span.tooltip-content' )
+      if span_tooltip_content
+        span_tooltip_content.inner_html = ''
+        span_tooltip_content.replace( '' )  ## check for how to delete/remove - why? why not!!
       end
+      span_field_listing_link = div.at( 'span.field-listing-link' )
+      if span_field_listing_link
+        span_field_listing_link.inner_html = ''
+        span_field_listing_link.replace( '' )
+      end
+      subsection_title = div.text.strip
+      html << "\n<h3>#{subsection_title}</h3>\n"
+      # a = div.css('a')[0]
+      # if a
+      #  subsection_title = a.text   ## todo/check/rename: use field_name or such - why? why not?
+      #  html << "\n<h3>#{subsection_title}:</h3>\n"
+      # else
+      #  subsection_title = '???'
+      #  puts "!! WARN: no anchor found:"
+      #  puts div.to_html
+      # end
       div = divs[1]
       div_children = div.children.select {|el| el.name == 'div' ? true : false }
@@ -157,7 +203,19 @@ def find_country_profile( html )
           end
          else
             if catdiv.to_html.index( 'country comparison to the world' )
-              ## silently skip for now country comparision
+              ## simplify/unlinkify country comparison
+              ## <div>
+              ##  <span class='category'>country comparison to the world:</span>
+              ##  <span class='category_data'>
+              ##    <a href="../fields/335rank.html#AU">97</a>
+              ##  </span>
+              ## </div>
+              ##  e.g. to =>
+              ##  <div>
+              ##    country comparison to the world: 97
+              ##  </div>
+              html << "<div>\n  #{squish( catdiv.text.strip )}\n</div>"
+              html << "\n"
             else
               puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
               puts catdiv.to_html
@@ -229,6 +287,9 @@ def sanitize_data( el, title: )
   ##  see fr (france) in political parties section for example
   ##  todo/check/fix:  check if we need to use unicode char!! and NOT html entity
   inner_html = inner_html.gsub( "&nbsp;", ' ' )
+  ## Unicode Character 'NO-BREAK SPACE' (U+00A0)
+  inner_html = inner_html.gsub( "\u00A0", ' ' )  ## use unicode char
   el.inner_html = inner_html.rstrip + "\n"
@@ -272,13 +333,17 @@ def sanitize_data( el, title: )
   #####
   # "unfancy" smart quotes to ascii - why? why not?
   # e.g.
-  # Following Britain’s victory => Following Britain's victory
+  #   Following Britain’s victory => Following Britain's victory
   html = html.tr( "’", "'" )
+  #   “full floor” House vote     => "full floor" House vote
+  html = html.tr( "“”", '""' )
   html
 end
+def squish( str )
+  str.gsub( /[ \t\n\r]{2,}/, ' ' )  ## replace runs of 2+ whitespace chars (spaces, tabs, newlines) with one space
+end
 end # class Sanitizer

data/lib/factbook-readers/sect.rb CHANGED

@@ -1,4 +1,3 @@
-# encoding: utf-8
 module Factbook
@@ -6,9 +5,9 @@ module Factbook
 class Sect
   include LogUtils::Logging
-  attr_accessor :title        ## use name instead of title - why? why not?
+  attr_accessor :title        ## use name instead of title - why? why not?
   attr_accessor :subsects
   def initialize
     @subsects = []
   end
@@ -16,7 +15,7 @@ class Sect
   def data
     ## convert sects to hash
     @data = {}
     subsects.each_with_index do |subsect,i|
       @data[ subsect.title ] = subsect.data
     end

data/lib/factbook-readers/subsect.rb CHANGED

@@ -1,4 +1,3 @@
-# encoding: utf-8
 module Factbook

data/lib/factbook-readers/table.rb CHANGED

@@ -1,4 +1,3 @@
-# encoding: utf-8
 module Factbook
@@ -9,12 +8,12 @@ module Factbook
 ##
 ##  for now reads in rows with values separated by at least 3+ spaces e.g.:
 ##   see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
-## 1      China                      1,367,485,388
-## 2      India                      1,251,695,584
-## 3      European Union             513,949,445
-## 4      United States              321,368,864
-## 5      Indonesia                  255,993,674
-## 6      Brazil                     204,259,812
+## 1      China                      1,367,485,388
+## 2      India                      1,251,695,584
+## 3      European Union             513,949,445
+## 4      United States              321,368,864
+## 5      Indonesia                  255,993,674
+## 6      Brazil                     204,259,812
 class TableReader
@@ -38,7 +37,7 @@ def read
     end
     values = line.split( /[ ]{3,}/ )    ## split three or more spaces - use just two ?? why? why not??
     ## puts line
     ## pp values
     recs << values

data/lib/factbook-readers/utils_info.rb CHANGED

@@ -1,4 +1,3 @@
-# encoding: utf-8
 module Factbook
   module Utils

data/lib/factbook-readers/version.rb CHANGED

@@ -4,7 +4,7 @@ module  Module
 module   Readers
   MAJOR = 1
   MINOR = 0
-  PATCH = 0
+  PATCH = 1
   VERSION = [MAJOR,MINOR,PATCH].join('.')
   def self.version

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: factbook-readers
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.1
 platform: ruby
 authors:
 - Gerald Bauer