RubyGems - factbook - Versions diffs - 1.2.2 → 2.0.0 - Mend

factbook 1.2.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +4 -4
data/{HISTORY.md → CHANGELOG.md} +3 -3
data/Manifest.txt +1 -1
data/README.md +548 -543
data/Rakefile +34 -33
data/data/codes.csv +262 -262
data/data/codesxref.csv +280 -280
data/lib/factbook.rb +68 -75
data/lib/factbook/builder.rb +14 -3
data/lib/factbook/builder_item.rb +93 -59
data/lib/factbook/page.rb +20 -57
data/lib/factbook/sanitizer.rb +98 -285
data/lib/factbook/version.rb +21 -22
data/script/json.rb +3 -2
data/test/data/src/au.html +658 -658
data/test/data/src/be.html +648 -648
data/test/helper.rb +11 -11
data/test/test_fields.rb +52 -52
data/test/test_json.rb +45 -45
data/test/test_page.rb +38 -38
metadata +31 -11

data/lib/factbook.rb CHANGED

@@ -1,75 +1,68 @@
-# encoding: utf-8
-## stdlibs
-require 'net/http'
-require 'net/https'     ## note: cia factbook requires https
-require 'uri'
-require 'cgi'
-require 'pp'
-require 'json'
-require 'csv'
-require 'fileutils'
-require 'erb'     ## used by Almanac class (for render)
-## 3rd party gems/libs
-## require 'props'
-require 'logutils'
-require 'fetcher'
-require 'nokogiri'
-require 'active_record'     ## add activerecord/db support (NOT optional for now)
-# our own code
-require 'factbook/version' # let it always go first
-require 'factbook/codes'
-require 'factbook/comparisons'
-require 'factbook/attributes'
-module Factbook
-  ##  auto-load builtin codes, comparisons, attributes, etc.
-  CODES       = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
-  COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv" )
-  ATTRIBUTES  = Attributes.from_yaml( "#{Factbook.root}/data/attributes.yml" )
-  def self.codes()       CODES; end
-  def self.comparisons() COMPARISONS; end
-  def self.attributes()  ATTRIBUTES; end
-end # module Factbook
-## note: make codes, comparisons, attributes available
-require 'factbook/utils'
-require 'factbook/utils_info'
-require 'factbook/sanitizer'
-require 'factbook/normalize'
-require 'factbook/builder_item'
-require 'factbook/builder'
-require 'factbook/builder_json'
-require 'factbook/page'
-require 'factbook/page_info'
-require 'factbook/sect'
-require 'factbook/subsect'
-require 'factbook/reader_json'
-require 'factbook/almanac'
-require 'factbook/table'    ## e.g. TableReader
-require 'factbook/counter'
-require 'factbook/db/schema'   ## database (sql tables) support
-require 'factbook/db/models'
-require 'factbook/db/importer'
-puts Factbook.banner     if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG
+## stdlibs
+require 'cgi'
+require 'csv'   ## fix: use csvreader!!!!
+require 'erb'     ## used by Almanac class (for render)
+## 3rd party gems/libs
+## require 'props'
+require 'logutils'
+require 'webget'
+require 'nokogiri'
+require 'active_record'     ## add activerecord/db support (NOT optional for now)
+# our own code
+require 'factbook/version' # let it always go first
+require 'factbook/codes'
+require 'factbook/comparisons'
+require 'factbook/attributes'
+module Factbook
+  ##  auto-load builtin codes, comparisons, attributes, etc.
+  CODES       = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
+  COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv" )
+  ATTRIBUTES  = Attributes.from_yaml( "#{Factbook.root}/data/attributes.yml" )
+  def self.codes()       CODES; end
+  def self.comparisons() COMPARISONS; end
+  def self.attributes()  ATTRIBUTES; end
+end # module Factbook
+## note: make codes, comparisons, attributes available
+require 'factbook/utils'
+require 'factbook/utils_info'
+require 'factbook/sanitizer'
+require 'factbook/normalize'
+require 'factbook/builder_item'
+require 'factbook/builder'
+require 'factbook/builder_json'
+require 'factbook/page'
+require 'factbook/page_info'
+require 'factbook/sect'
+require 'factbook/subsect'
+require 'factbook/reader_json'
+require 'factbook/almanac'
+require 'factbook/table'    ## e.g. TableReader
+require 'factbook/counter'
+require 'factbook/db/schema'   ## database (sql tables) support
+require 'factbook/db/models'
+require 'factbook/db/importer'
+puts Factbook.banner

data/lib/factbook/builder.rb CHANGED

@@ -40,12 +40,23 @@ def initialize( html_ascii )
   ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8  (from binary/ascii8bit)
   @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
-  @html_debug = map_sects( @html )
-  @html_debug = map_subsects( @html_debug )
-  html_sects = split_sects( @html_debug )
+  html_sects =  if @html.empty?
+                   ## note: support "empty" pages - old format waiting for update!!!
+                   ##    cannot parse for now
+                   []  ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
+                else
+                   @html_debug = map_sects( @html )
+                   @html_debug = map_subsects( @html_debug )
+                   split_sects( @html_debug )
+                end
   pp html_sects
+  ## debug
+  ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
   @sects = []
   html_sects.each do |html_sect|

data/lib/factbook/builder_item.rb CHANGED

@@ -5,88 +5,122 @@ module Factbook
 class ItemBuilder       ## renameto ItemReader, ItemParser - why? why not??
   include LogUtils::Logging
   include NormalizeHelper    ##  e.g. normalize_category
 def initialize( html, name )
   @html = html
   @name = name     # add category/field name e.g. Area, Location, etc.
 end
+##
+## <div class="category_data subfield text">
+## Portuguese  (official and most widely spoken language)
+##
+## </div>
+## <div class="category_data note">
+## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
+## </div>
 def read
   ## return hash from html snippet
   doc = Nokogiri::HTML.fragment( @html )
   data = {}
-  last_node = nil     ## track last hash (always use text key)
-  last_node_data_count = 0
   ## note:
   ##   skip whitespace text nodes (e.g. \n\n etc); just use divs
-  doc.children.filter('div').each_with_index do |child,i|
-    if child['class'] == 'category_data'
-       text = child.text    ## fix/todo: use strip
-       puts "category_data: >#{text}<"
-       if last_node.nil?
-          ## assume its the very first entry; use implied/auto-created category
-          data['text'] = ''
-          last_node = data
-          last_node_data_count = 0
-       end
-       ### first category_data element?
-      if last_node_data_count == 0
-         if last_node['text'] == ''
-            last_node['text'] = text
-         else   ### possible ??? if data_count is zero - not should not include any data
-            ## todo: issue warning here - why? why not??
-            last_node['text'] += " #{text}"    ## append w/o separator
-         end
-      else
-        if @name == 'Demographic profile'  ## special case (use space a sep)
-            last_node['text'] += " #{text}"   ## append without (w/o) separator
+  doc_children = doc.children.filter('div')
+  puts "  parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
+  doc_children.each_with_index do |div,i|
+    if div['class'].index( 'note' )
+      text = squish( div.text.strip )
+      puts "category_data: >#{text}<"
+      data['note'] = { 'text' => text }
+    elsif div['class'].index( 'historic' )
+      ## add all historic together into one for now
+        text = squish( div.text.strip )
+        puts "category_data: >#{text}<"
+        if i == 0
+          data['text'] = text
         else
-            last_node['text'] += " ++ #{text}"   ## append with ++ separator
+          ## append with / for now
+          data['text'] += " / #{text}"
         end
-      end
-      last_node_data_count += 1
+      elsif div.css( 'span.subfield-name').empty?
+      ## assume "implied text field"
+      ## check for index == 1 / child count == 1 - why? why not
+      text = squish( div.text.strip )    ## fix/todo: use strip
+      puts "category_data: >#{text}<"
-    elsif child['class'].nil?    ## div without any class e.g. <div>..</div>
-                                 ##   assume category and category_data pair w/ spans
-      spans = child.children.filter('span')
-      if spans.size > 2
-        puts "*** warn: expected two (or one) spans; got #{spans.inspect}"
+      data['text'] = text
+      ## must be always first node for now
+      if i != 0
+        puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
+        puts @html
+        exit 1
       end
-      ## pp spans
-      span_key   = spans[0]  ## assume 1st entry is span.category
-      span_value = spans[1]  ## assume 2nd entry is span.category_data
-      key   = normalize_category( span_key.text )
-      ## note: allow optional category_data for now
-      value = span_value ? span_value.text : nil
-      puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
-      ## start new pair
-      last_node = data[key] = { 'text' => value }
-      last_node_data_count =  value ? 1 : 0    ## note: set to 1 if value present
+    elsif div['class'].index( 'grouped_subfield' )
+## split grouped subfield!!
+##   <span class="subfield-name">arable land:</span>
+## <span class="subfield-number">8.6%</span>
+## <span class="subfield-date">(2011 est.)</span>
+##  /
+## <span class="subfield-name">permanent crops:</span>
+## <span class="subfield-number">0.8%</span>
+## <span class="subfield-date">(2011 est.)</span>
+##   /
+## <span class="subfield-name">permanent pasture:</span>
+## <span class="subfield-number">23.5%</span>
+## <span class="subfield-date">(2011 est.)</span>
+## join names for now - why? why not?
+##  e.g. becomes:
+##   arable land / permanent crops / permanent pasture: for key ??
+     span_names = div.css( 'span.subfield-name')
+     keys = []
+     span_names.each do |span|
+       keys << normalize_category( span.text.strip )
+       span.replace( '' )
+     end
+     key = keys.join( ' / ')
+     text = squish( div.text.strip )
+     puts "category_data key >#{key}<: >#{text}<"
+     data[ key ] = { 'text' => text }
     else
-      puts "*** warn: item builder -- unknow css class in #{child.inspect}"
+      ## get subfield name
+      span_names = div.css( 'span.subfield-name')
+      if span_names.size > 1
+        puts "!! ERROR - found more than one subfield-name:"
+        puts div.to_html
+        exit 1
+      end
+      key = normalize_category( span_names[0].text.strip )
+      span_names[0].replace( '' )
+      text = squish( div.text.strip )
+      puts "category_data key >#{key}<: >#{text}<"
+      data[ key ] = { 'text' => text }
     end
-    ## pp child
-    ## css = child['class']
-    ## puts "[#{i}] #{child.name}  class='>#{css}< : #{css.class.name}' >#{child.text}<"
   end
   pp data
   data
 end
+def squish( str )
+  str.gsub( /[ \t\n\r]{2,}/, ' ')  ## replace multi-spaces (incl. newlines with once space)
+end
 end # class ItemBuilder
 end # module Factbook

data/lib/factbook/page.rb CHANGED

@@ -1,4 +1,3 @@
-# encoding: utf-8
 module Factbook
@@ -38,10 +37,10 @@ class Page
   def initialize( code, opts={} )
     ### keep code - why? why not??  (use page_info/info e.g. info.country_code??)
     if opts[:json]
       json = opts[:json]    ## note: json is (still) a string/text (NOT yet parsed to structured data)
-      b = JsonBuilder.from_string( json )
+      b = JsonBuilder.from_string( json )
     else  ## assume html
       if opts[:html]    ## note: expects ASCII-7BIT/BINARY encoding
          ## for debugging and testing allow "custom" passed-in html page
@@ -49,11 +48,13 @@ class Page
       else
         url_string =  SITE_BASE.gsub( '{code}', code )
         ## note: expects ASCII-7BIT/BINARY encoding
-        html = fetch_page( url_string )   ## use PageFetcher class - why?? why not??
-      end
+        ## html = fetch_page( url_string )   ## use PageFetcher class - why?? why not??
+        html = Webcache.read( url_string )
+      end
       b = Builder.from_string( html )
     end
     @sects = b.sects
     @info  = b.info
@@ -65,7 +66,7 @@ class Page
       @info = info
     end
-    @data = {}
+    @data = {}
     @sects.each do |sect|
       @data[ sect.title ] = sect.data
     end
@@ -78,7 +79,7 @@ class Page
     if opts[:minify]
       data.to_json
     else
-      ## was: -- opts[:pretty] || opts[:pp]
+      ## was: -- opts[:pretty] || opts[:pp]
       JSON.pretty_generate( data )   ## note: pretty print by default!
     end
   end
@@ -96,10 +97,10 @@ class Page
   end
   ## add convenience (shortcut) accessors / attributes / fields / getters
   ATTRIBUTES.each do |attrib|
     ## e.g.
-    ##    def background()  data['Introduction']['Background']['text']; end
+    ##    def background()  data['Introduction']['Background']['text']; end
     ##    def location()    data['Geography']['Location']['text'];      end
     ##    etc.
     if attrib.path.size == 1
@@ -114,31 +115,18 @@ class Page
               fetch( attrib.path[1], {} )['text']
       end
     end
-  end
+  end
 private
-  def fetch_page( url_string )
-    worker = Fetcher::Worker.new
-    response = worker.get_response( url_string )
-    if response.code == '200'
-      t = response.body
-      ###
-      # NB: Net::HTTP will NOT set encoding UTF-8 etc.
-      # will mostly be ASCII
-       # - try to change encoding to UTF-8 ourselves
-      logger.debug "t.encoding.name (before): #{t.encoding.name}"
-      #####
-      # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
-      t
-    else
-      logger.error "fetch HTTP - #{response.code} #{response.message}"
-      ## todo/fix: raise http exception (see fetcher)  -- why? why not??
-      fail "fetch HTTP - #{response.code} #{response.message}"
-      nil
-    end
+  def fetch_page( url )
+    response = Webget.page( url )
+    ## note: exit on get / fetch error - do NOT continue for now - why? why not?
+    exit 1   if response.status.nok?    ## e.g.  HTTP status code != 200
+    response.text
   end
@@ -157,29 +145,4 @@ end
 end # class Page
-=begin
-class PageFetcher
-def fetch( cc )
-  worker = Fetcher::Worker.new
-  factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
-  res = worker.get_response( "#{factbook_base}/#{cc}.html" )
-  # on error throw exception - why? why not??
-  if res.code != '200'
-    raise Fetcher::HttpError.new( res.code, res.message )
-  end
-  ###
-  # Note: Net::HTTP will NOT set encoding UTF-8 etc.
-  #   will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
-  html = res.body.to_s
-end
-end # PageFetcher
-=end
 end # module Factbook