RubyGems - factbook - Versions diffs - 0.1.3 → 1.0.0 - Mend

factbook 0.1.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

checksums.yaml +7 -0
data/Manifest.txt +34 -22
data/README.md +8 -3
data/Rakefile +2 -263
data/data/codes.csv +262 -0
data/data/comparisons.csv +75 -0
data/lib/factbook/builder.rb +214 -0
data/lib/factbook/builder_item.rb +93 -0
data/lib/factbook/codes.rb +119 -0
data/lib/factbook/comparisons.rb +50 -0
data/lib/factbook/page.rb +103 -303
data/lib/factbook/sanitizer.rb +214 -0
data/lib/factbook/sect.rb +29 -196
data/lib/factbook/subsect.rb +18 -0
data/lib/factbook/table.rb +52 -0
data/lib/factbook/utils.rb +85 -0
data/lib/factbook/utils_info.rb +102 -0
data/lib/factbook/version.rb +4 -3
data/lib/factbook.rb +23 -1
data/test/data/au.html +579 -0
data/test/data/au.yml +8 -0
data/test/data/be.html +596 -0
data/test/data/be.yml +8 -0
data/test/data/src/au.html +2006 -0
data/test/data/src/be.html +2011 -0
data/test/helper.rb +0 -4
data/test/test_builder.rb +37 -0
data/test/test_codes.rb +76 -0
data/test/test_comparisons.rb +19 -0
data/test/test_fields.rb +21 -18
data/test/test_item_builder.rb +99 -0
data/test/test_json.rb +17 -20
data/test/test_page.rb +18 -10
data/test/test_sanitizer.rb +35 -0
metadata +68 -49
data/.gemtest +0 -0
data/test/data/countrytemplate_au.html +0 -4179
data/test/data/countrytemplate_be.html +0 -4260
data/test/data/countrytemplate_br.html +0 -4366
data/test/data/countrytemplate_ee.html +0 -2999
data/test/data/countrytemplate_ls.html +0 -2728
data/test/data/countrytemplate_mx.html +0 -4397
data/test/data/countrytemplate_vt.html +0 -1726
data/test/data/countrytemplate_xx.html +0 -2898
data/test/test_page_old.rb +0 -478
data/test/test_strip.rb +0 -66

data/lib/factbook/sect.rb CHANGED Viewed

@@ -1,196 +1,29 @@
-# encoding: utf-8
-module Factbook
-  class Sect   # section (e.g. Introduction/Geography/People/Economy/Energy/Transport/etc.)
-    include LogUtils::Logging
-    attr_reader :title, :html
-    def initialize( title, html, opts={} )
-      ## todo: passing a ref to the parent page - why? why not??
-      @title = title
-      @html  = html
-      @opts  = opts    # fields:  full|long|keep|std|???  -- find a good name for the option keeping field names as is
-      @doc   = nil
-      @data  = nil
-    end
-    def doc
-      ### check: use nokogiri html fragment? why? why not??
-      @doc ||= Nokogiri::HTML( @html )
-    end
-    def data
-      @data ||= sect_to_hash( doc )
-    end
-private
-  def cleanup_key( key )
-    if @opts[:fields]    #  if set assume full|long|keep for now
-      ### kepe field names as is
-      ##  e.g.
-      ##   GDP - composition, by sector of origin:
-      ##   Budget surplus (+) or deficit (-):
-      ##  becomes:
-      ##   GDP - composition, by sector of origin
-      ##   Budget surplus (+) or deficit (-)
-      key = key.strip
-      key = key.gsub( /[ ]{2,}/, ' ' )   # fold two plus spaces into one  -- check if exists?
-      key = key.gsub( /:\z/, '' )    # remove trailing : if present
-      key = key.strip
-    else
-      ## to lower case
-      key = key.downcase
-      ## seaport(s)  => seaports
-      key = key.gsub( '(s)', 's' )
-      key = key.gsub( ':', '' )    # trailing :  ## fix: use regex /:$/ w/ anchor??
-      ## remove special chars ()+-/,'
-      key = key.gsub( /['()+\-\/,]/, ' ' )
-      key = key.strip
-      key = key.gsub( /[ ]+/, '_' )
-    end
-    key
-  end
-  def sect_to_hash( sect )
-    rows  = sect.css( 'table tr' )
-    cells = sect.css( 'table tr td' )
-    field_ids = rows.css( '#field' )    ## check - use div#field.category -- possible?
-    data_ids  = rows.css( '#data' )
-    logger.debug "rows.size:    #{rows.size}  (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
-    hash = {}
-    last_cat = nil
-    cells.each_with_index do |cell,i|
-      ## next if i > 14   ## skip after xx for debugging for now
-      # check if field or data id
-      # check for (nested) div#field in td
-      has_field_id  =  cell.css( '#field' ).size == 1 ? true : false
-      # check for td#data
-      has_data_id =  cell['id'] == 'data' ? true : false
-      if has_field_id
-        cats  = cell.css( 'div.category' )   ## note: ignore all .category not using div (issue warn/err if found!!) etc.
-        if cats.size == 1
-          text = cleanup_key( cats.first.text.strip )   # remove/strip leading and trailing spaces
-          last_cat = text
-          logger.debug "  [#{i}] category: >>#{text}<<"
-        else
-          logger.warn "**** !!!!!! warn/err - found element w/ field id  (no match for subsection!!! - check)"
-          logger.warn cell.to_s
-        end
-      elsif has_data_id
-        cats      = cell.css( 'div.category' )   ## note: ignore all .category not using div (issue warn/err if found!!) etc.
-        cats_data = cell.css( 'div.category_data,span.category_data' )  ## note: ignore a.category_data etc.
-        cats_div_data  =  cell.css( 'div.category_data' )
-        cats_span_data =  cell.css( 'span.category_data' )
-        logger.debug "    - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
-        pairs = []
-        last_pair = nil
-        last_pair_data_count = 0
-        ## loop over div blocks (might be .category or .category_data)
-        cell.children.each_with_index do |child,j|
-           unless child.element?
-             ## puts "   **** !!!! skipping non-element type >#{child.type}<:"
-             ## puts child.to_s
-             next
-           end
-           unless child.name == 'div'
-             logger.warn "   **** !!! skipping non-div >#{child.name}<:"
-             logger.warn child.to_s
-             next
-           end
-           ### check if .category or .category_data
-           if child['class'] == 'category'
-              ## collect text for category; exclude element w/ class.category_data
-              text = ""
-              child.children.each do |subchild|
-                text << subchild.text.strip     unless subchild.element? && subchild['class'] == 'category_data'
-              end
-              text = cleanup_key( text )
-              value = child.css('span.category_data').text.strip
-              logger.debug "        -- category >>#{text}<<"
-              ## start new pair
-              last_pair = [ text, value ]
-              last_pair_data_count = 0
-              pairs << last_pair
-           elsif child['class'] == 'category_data'
-              logger.debug "        -- category_data"
-              text = child.text.strip
-              if last_pair.nil?
-                ## assume its the very first entry; use implied/auto-created category
-                last_pair = [ 'text', '' ]
-                last_pair_data_count = 0
-                pairs << last_pair
-              end
-              ### first category_data element?
-              if last_pair_data_count == 0
-                if last_pair[1] == ''
-                  last_pair[1] = text
-                else
-                  last_pair[1] += " #{text}"    ## append w/o separator
-                end
-              else
-                if last_cat == 'demographic_profile' || last_cat == 'Demographic profile'  ## special case (use space a sep)
-                  last_pair[1] += " #{text}"   ## append with separator
-                else
-                  last_pair[1] += "; #{text}"   ## append with separator
-                end
-              end
-              last_pair_data_count += 1
-           else
-              logger.warn "  **** !!! skipping div w/o category or category_data class:"
-              logger.warn child.to_s
-           end
-        end
-        ## pp pairs
-        ## pairs to hash
-        pairs_hash = {}
-        pairs.each do |pair|
-          pairs_hash[ pair[0] ] = pair[1]
-        end
-        hash[ last_cat ] = pairs_hash
-      else
-        logger.warn "#### !!!!  unknown cell type (no field or data id found):"
-        logger.warn cell.to_s
-      end
-    end # each cell
-    hash  # return hash
-  end # method sect_to_hash
-  end  # class Sect
-end # module Factbook
+# encoding: utf-8
+module Factbook
+class Sect
+  include LogUtils::Logging
+  attr_accessor :title        ## use name instead of title - why? why not?
+  attr_accessor :subsects
+  def initialize
+    @subsects = []
+  end
+  def data
+    ## convert sects to hash
+    @data = {}
+    subsects.each_with_index do |subsect,i|
+      @data[ subsect.title ] = subsect.data
+    end
+    @data
+  end
+end # class Sect
+end # module Factbook

data/lib/factbook/subsect.rb ADDED Viewed

@@ -0,0 +1,18 @@
+# encoding: utf-8
+module Factbook
+class Subsect
+  include LogUtils::Logging
+  attr_accessor :title        ## use name instead of title - why? why not?
+  attr_accessor :data         ## hash holding data e.g. { 'text' => '...' etc. }
+  def initialize
+    @data = {}
+  end
+end # class Subsect
+end # module Factbook

data/lib/factbook/table.rb ADDED Viewed

@@ -0,0 +1,52 @@
+# encoding: utf-8
+module Factbook
+##
+## make more "generic"  - why? why not?
+##   (re)use for other files ?? move to textutils ??
+##
+##  for now reads in rows with values separated by at least 3+ spaces e.g.:
+##   see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
+## 1      China                      1,367,485,388
+## 2      India                      1,251,695,584
+## 3      European Union             513,949,445
+## 4      United States              321,368,864
+## 5      Indonesia                  255,993,674
+## 6      Brazil                     204,259,812
+class TableReader
+  include LogUtils::Logging
+def initialize( text )
+  @text = text
+end
+def read
+  recs = []
+  line_no = 0
+  @text.each_line do |line|
+    line_no +=1
+    line = line.strip   ## remove leading and trailing whitespace
+    if line.empty?
+      puts "** skipping empty line #{line_no}"
+      next
+    end
+    values = line.split( /[ ]{3,}/ )    ## split three or more spaces - use just two ?? why? why not??
+    ## puts line
+    ## pp values
+    recs << values
+  end
+  recs
+end
+end # class TableReader
+end # module Factbook

data/lib/factbook/utils.rb ADDED Viewed

@@ -0,0 +1,85 @@
+# encoding: utf-8
+module Factbook
+  module Utils
+########################################
+## todo: move to textutils - why, why not ?????
+def encode_utf8( text )
+  errors = []   ## also return list of encoding errors
+  ## note: factbook claims utf-8  - but includes invalid bytes in some pages
+  ##   encoding is likely western/windows-*
+  ## note:
+  ##   use �    - unknown/invalid unicode char
+  ##  fix/todo: use ASCII-8BIT instead of binary
+  text = text.encode('UTF-8', 'binary', :invalid => :replace,
+                                        :undef   => :replace,
+                                        :replace => '�' )
+  ## check for replaced/invalid chars and log warning
+  pos = text.index( '�' )
+  while pos
+    from = pos-10   ## todo/fix: use min/max to check for bounds - why? why not??
+    to   = pos+10
+    around = text[from..to]
+    puts "  pos #{pos}, from #{from}, to #{to}, around >#{around}<"
+    msg  = "invalid char on pos #{pos} around: >#{around}<"
+    puts msg
+    ## also log message / w timestamp
+    errors << "#{Time.now} - #{msg}"
+    pos = text.index( '�', pos+1 )
+  end
+  [text,errors]   ## return text and errors (list)
+end
+def values_to_csv( values )
+  buf = ""
+  values.each_with_index do |value,i|
+     buf << ','  if i > 0    ## add comma (except for first value)
+     ## note: allow optional $ sign e.g. $100,000,000
+     ##  !!!! todo/fix: allow optional minus e.g. -44,000
+     if value =~ /^\$?[1-9][,0-9]+[0-9]$/    ### find a better regex - why? why not??
+       ## check if number e.g. 17,098,242  or $17,098,242
+       ##   remove commas  17098242
+       buf << value.gsub( ',', '' )
+     elsif value.index( ',').nil?
+       ## add as is 1:1 (no comma)
+       buf << value
+     else
+       ## escape comma with double quote
+       #   e.g. Guam, The becomes "Guam, The"
+       buf << '"'
+       buf << value
+       buf << '"'
+     end
+  end
+  buf
+end
+def data_to_csv( recs, headers )
+  text = ""
+  text << values_to_csv( headers )
+  text << "\n"
+  recs.each do |rec|
+    text << values_to_csv( rec )
+    text << "\n"
+  end
+  text
+end
+  end   # module Utils
+end     # module Factbook

data/lib/factbook/utils_info.rb ADDED Viewed

@@ -0,0 +1,102 @@
+# encoding: utf-8
+module Factbook
+  module Utils
+#######
+## find meta data (about page info)
+#### e.g. Page last updated on September 16, 2015
+MONTH_EN_TO_S={
+  'January'   => '1',
+  'February'  => '2',
+  'March'     => '3',
+  'April'     => '4',
+  'May'       => '5',
+  'June'      => '6',
+  'July'      => '7',
+  'August'    => '8',
+  'September' => '9',
+  'October'   => '10',
+  'November'  => '11',
+  'December'  => '12'
+}
+PAGE_LAST_UPDATED_REGEX = /
+                           Page \s last \s updated \s on \s
+                            (?<month_en>[a-z]+) \s
+                            (?<day>\d{1,2}), \s
+                            (?<year>\d{4})
+                          /imx
+def find_page_last_updated( html )
+  m = PAGE_LAST_UPDATED_REGEX.match( html )
+  if m
+    pp m
+    month_en = m[:month_en]
+    day      = m[:day]
+    year     = m[:year]
+    puts "** bingo - month #{month_en}, day #{day}, year #{year}"
+    month = MONTH_EN_TO_S[ month_en ]
+    date_str = "#{year}-#{month}-#{day}"
+    pp date_str
+    date = Date.strptime( date_str, '%Y-%m-%d' )
+    date
+  else
+    nil
+  end
+end
+##
+## e.g. regioncode="eur"
+##      countrycode="au"
+##      countryname="Austria"
+##      flagsubfield=""
+##      countryaffiliation=""
+##      flagdescription=""
+##      flagdescriptionnote=""
+##      region="Europe"
+##
+##   note: countryaffiliation may be empty
+PAGE_INFO_REGEX = /
+             regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
+               \s+
+             countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2>       ## is k<3> backref
+               \s+
+              countryname=(?<q3>"|')(?<country>.+?)\k<q3>
+               \s+
+                [^>]+?  ## allow any attribs (note: non-greedy)
+              countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4>     ## note: might be empty
+               \s+
+                [^>]+?  ## allow any attribs (note: non-greedy)
+              region=(?<q5>"|')(?<region>.+?)\k<q5>    ## check world - might be empty ?? or for ocean ??
+           /imx
+def find_page_info( html )
+  m = PAGE_INFO_REGEX.match( html )
+  if m
+    pp m
+    h = { country_code:        m[:country_code],
+          country_name:        m[:country],
+          country_affiliation: m[:affiliation],
+          region_code:         m[:region_code],
+          region_name:         m[:region] }
+    puts "** bingo - #{h.inspect}"
+    h    ## return hash w/ name-value pairs
+  else
+    nil   ## or return empty struct with nils/empty strings - why?? why not??
+  end
+end
+  end   # module Utils
+end     # module Factbook

data/lib/factbook/version.rb CHANGED Viewed

@@ -1,9 +1,10 @@
+# encoding: utf-8
 module Factbook
-  MAJOR = 0
-  MINOR = 1
-  PATCH = 3
+  MAJOR = 1
+  MINOR = 0
+  PATCH = 0
   VERSION = [MAJOR,MINOR,PATCH].join('.')
   def self.version

data/lib/factbook.rb CHANGED Viewed

@@ -7,6 +7,7 @@ require 'uri'
 require 'cgi'
 require 'pp'
 require 'json'
+require 'csv'
 require 'fileutils'
@@ -21,11 +22,32 @@ require 'nokogiri'
 # our own code
 require 'factbook/version' # let it always go first
+require 'factbook/utils'
+require 'factbook/utils_info'
+require 'factbook/sanitizer'
+require 'factbook/builder_item'
+require 'factbook/builder'
 require 'factbook/page'
 require 'factbook/sect'
+require 'factbook/subsect'
+require 'factbook/codes'
+require 'factbook/comparisons'
+require 'factbook/table'    ## e.g. TableReader
-puts Factbook.banner
+module Factbook
+  ##  auto-load builtin codes and comparisons
+  CODES       = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
+  COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv")
+  def self.codes()       CODES; end
+  def self.comparisons() COMPARISONS; end
+end # module Factbook
+puts Factbook.banner     if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG