RubyGems - factbook-readers - Versions diffs - 0.0.1 - Mend

factbook-readers 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/Manifest.txt +56 -0
data/README.md +196 -0
data/Rakefile +34 -0
data/data/attributes.yml +337 -0
data/data/categories.csv +164 -0
data/data/codes.csv +262 -0
data/data/codesxref.csv +280 -0
data/data/comparisons.csv +75 -0
data/lib/factbook-readers.rb +59 -0
data/lib/factbook-readers/attributes.rb +74 -0
data/lib/factbook-readers/builder.rb +212 -0
data/lib/factbook-readers/builder_item.rb +185 -0
data/lib/factbook-readers/builder_json.rb +79 -0
data/lib/factbook-readers/codes.rb +122 -0
data/lib/factbook-readers/comparisons.rb +50 -0
data/lib/factbook-readers/counter.rb +48 -0
data/lib/factbook-readers/normalize.rb +43 -0
data/lib/factbook-readers/page.rb +148 -0
data/lib/factbook-readers/page_info.rb +12 -0
data/lib/factbook-readers/reader_json.rb +51 -0
data/lib/factbook-readers/sanitizer.rb +307 -0
data/lib/factbook-readers/sect.rb +29 -0
data/lib/factbook-readers/subsect.rb +18 -0
data/lib/factbook-readers/table.rb +52 -0
data/lib/factbook-readers/utils.rb +47 -0
data/lib/factbook-readers/utils_info.rb +129 -0
data/lib/factbook-readers/version.rb +24 -0
data/lib/factbook/readers.rb +5 -0
data/test/data/au.html +579 -0
data/test/data/au.yml +8 -0
data/test/data/be.html +596 -0
data/test/data/be.yml +8 -0
data/test/data/json/au.json +892 -0
data/test/data/src/ag.html +716 -0
data/test/data/src/au-2015-09-24.html +2006 -0
data/test/data/src/au.html +658 -0
data/test/data/src/be-2015-09-24.html +2011 -0
data/test/data/src/be.html +648 -0
data/test/helper.rb +11 -0
data/test/test_attribs.rb +87 -0
data/test/test_attribs_def.rb +20 -0
data/test/test_builder.rb +35 -0
data/test/test_codes.rb +76 -0
data/test/test_comparisons.rb +19 -0
data/test/test_convert.rb +30 -0
data/test/test_counter.rb +31 -0
data/test/test_fields.rb +52 -0
data/test/test_importer.rb +56 -0
data/test/test_item_builder.rb +99 -0
data/test/test_json.rb +45 -0
data/test/test_json_builder.rb +25 -0
data/test/test_normalize.rb +23 -0
data/test/test_page.rb +38 -0
data/test/test_sanitizer.rb +39 -0
data/test/test_sanitizer_regex.rb +89 -0
metadata +196 -0

data/data/comparisons.csv ADDED

@@ -0,0 +1,75 @@
+Num,Category,Name
+2147,Geography,Area
+2119,People and Society,Population
+2002,People and Society,Population growth rate
+2054,People and Society,Birth rate
+2066,People and Society,Death rate
+2112,People and Society,Net migration rate
+2223,People and Society,Maternal mortality rate
+2091,People and Society,Infant mortality rate
+2102,People and Society,Life expectancy at birth
+2127,People and Society,Total fertility rate
+2225,People and Society,Health expenditures
+2155,People and Society,HIV/AIDS - adult prevalence rate
+2156,People and Society,HIV/AIDS - people living with HIV/AIDS
+2157,People and Society,HIV/AIDS - deaths
+2228,People and Society,Obesity - adult prevalence rate
+2224,People and Society,Children under the age of 5 years underweight
+2206,People and Society,Education expenditures
+2229,People and Society,"Unemployment, youth ages 15-24"
+2001,Economy,GDP (purchasing power parity)
+2003,Economy,GDP - real growth rate
+2004,Economy,GDP - per capita (PPP)
+2260,Economy,Gross national saving
+2089,Economy,Industrial production growth rate
+2095,Economy,Labor force
+2129,Economy,Unemployment rate
+2172,Economy,Distribution of family income - Gini index
+2221,Economy,Taxes and other revenues
+2222,Economy,Budget surplus (+) or deficit (-)
+2186,Economy,Public debt
+2092,Economy,Inflation rate (consumer prices)
+2207,Economy,Central bank discount rate
+2208,Economy,Commercial bank prime lending rate
+2214,Economy,Stock of narrow money
+2215,Economy,Stock of broad money
+2211,Economy,Stock of domestic credit
+2200,Economy,Market value of publicly traded shares
+2187,Economy,Current account balance
+2078,Economy,Exports
+2087,Economy,Imports
+2188,Economy,Reserves of foreign exchange and gold
+2079,Economy,Debt - external
+2198,Economy,Stock of direct foreign investment - at home
+2199,Economy,Stock of direct foreign investment - abroad
+2232,Energy,Electricity - production
+2233,Energy,Electricity - consumption
+2234,Energy,Electricity - exports
+2235,Energy,Electricity - imports
+2236,Energy,Electricity - installed generating capacity
+2237,Energy,Electricity - from fossil fuels
+2239,Energy,Electricity - from nuclear fuels
+2238,Energy,Electricity - from hydroelectric plants
+2240,Energy,Electricity - from other renewable sources
+2241,Energy,Crude oil - production
+2242,Energy,Crude oil - exports
+2243,Energy,Crude oil - imports
+2244,Energy,Crude oil - proved reserves
+2245,Energy,Refined petroleum products - production
+2246,Energy,Refined petroleum products - consumption
+2247,Energy,Refined petroleum products - exports
+2248,Energy,Refined petroleum products - imports
+2249,Energy,Natural gas - production
+2250,Energy,Natural gas - consumption
+2251,Energy,Natural gas - exports
+2252,Energy,Natural gas - imports
+2253,Energy,Natural gas - proved reserves
+2150,Communications,Telephones - fixed lines
+2151,Communications,Telephones - mobile cellular
+2153,Communications,Internet users
+2053,Transportation,Airports
+2121,Transportation,Railways
+2085,Transportation,Roadways
+2093,Transportation,Waterways
+2108,Transportation,Merchant marine
+2034,Military,Military expenditures

data/lib/factbook-readers.rb ADDED

@@ -0,0 +1,59 @@
+## Main entry point of the factbook-readers library:
+##   1) requires the 3rd party libs
+##   2) requires version, codes, comparisons and attributes
+##   3) reads the built-in datasets into constants (at load time)
+##   4) requires the remaining readers/builders
+##   5) prints the banner
+##
+## 3rd party gems/libs
+## require 'props'
+require 'logutils'
+require 'webget'
+require 'csvreader'
+require 'nokogiri'
+# our own code
+require 'factbook-readers/version' # let it always go first
+require 'factbook-readers/codes'
+require 'factbook-readers/comparisons'
+require 'factbook-readers/attributes'
+module Factbook
+  ##  auto-load builtin codes, comparisons, attributes, etc.
+  ##   note: the three data files get read right here, that is, as soon as this file gets required;
+  ##         all paths are built from Factbook::Module::Readers.root
+  ##         (defined outside of this file - presumably in version.rb; verify)
+  CODES       = Codes.from_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" )
+  COMPARISONS = Comparisons.from_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
+  ATTRIBUTES  = Attributes.from_yaml( "#{Factbook::Module::Readers.root}/data/attributes.yml" )
+  ## convenience helpers - return the (preloaded) constants from above
+  def self.codes()       CODES; end
+  def self.comparisons() COMPARISONS; end
+  def self.attributes()  ATTRIBUTES; end
+end # module Factbook
+## note: make codes, comparisons, attributes available
+##   (that is, the files below get required only AFTER the constants above are defined;
+##    keep this order)
+require 'factbook-readers/utils'
+require 'factbook-readers/utils_info'
+require 'factbook-readers/sanitizer'
+require 'factbook-readers/normalize'
+require 'factbook-readers/builder_item'
+require 'factbook-readers/builder'
+require 'factbook-readers/builder_json'
+require 'factbook-readers/page'
+require 'factbook-readers/page_info'
+require 'factbook-readers/sect'
+require 'factbook-readers/subsect'
+require 'factbook-readers/reader_json'
+require 'factbook-readers/table'    ## e.g. TableReader
+require 'factbook-readers/counter'
+## note: writes the banner to stdout when this file gets loaded
+puts Factbook::Module::Readers.banner

data/lib/factbook-readers/attributes.rb ADDED

@@ -0,0 +1,74 @@
+# encoding: utf-8
+module Factbook
+class Attributes
+  Attribute = Struct.new( :name,
+                          :category,  ## e.g. Introduction, Geography, etc.
+                          :path,      ## note: is an array  e.g. ["Area - comparative"] or ["Area", "land"] etc.
+                        )
+  def self.from_yaml( path )
+    h = YAML.load_file( path )
+    pp h
+    attribs = []
+    ## note: use a copy (e.g. h.dup) for now (hash gets changed by build_attribs!!)
+    new_h = h.dup
+    new_h.each do |k,v|
+      category = k
+      build_attribs( attribs, category, [], v )
+    end
+    self.new( attribs )
+  end
+  def self.build_attribs( attribs, category, path, h )
+      ## assume it's an attribute definition hash
+      ##   note: !! exclude special cases:
+      ##      Capital           -- incl. name key itself
+      ##      National anthem
+     if h.has_key?( 'name' ) &&  ['Capital','National anthem'].include?( path[-1] ) == false
+       a = Attribute.new
+       a.name     = h['name']
+       a.category = category
+       a.path     = path
+       puts "  adding attribute >#{a.name}< using #{a.category} / #{a.path.inspect}"
+       attribs << a
+       ## note: make sure a modifable copy (of h) gets passed in
+       h.delete( 'name' )
+     end
+     return  if h.empty?    ## empty hash; nothing (more) to do; return
+     ## continue walking (recursive)
+     h.each do |k,v|
+       new_path = path.dup << k   ## note: create a new array (copy)
+       build_attribs( attribs, category, new_path, v )
+    end
+  end
+  def initialize( attribs )
+    @attribs = attribs
+  end
+  def to_a() @attribs; end
+  def size() @attribs.size; end
+  def each
+    @attribs.each { |attrib| yield( attrib ) }
+  end
+end  # class Attributes
+end # module Factbook

data/lib/factbook-readers/builder.rb ADDED

@@ -0,0 +1,212 @@
+# encoding: utf-8
+module Factbook
+class Builder     ## todo: change to PageBuilder ???
+  include LogUtils::Logging
+=begin
+def self.from_cc( cc, opts={} )  ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
+  ## check/todo: rename input_dir to just dir or to include ?
+  ##   (there's no output_dir)?? - why? why not?
+  input_dir = opts[:input_dir] || '.'
+  self.from_file( "#{input_dir}/#{cc}.html" )
+end
+=end
+def self.from_file( path )
+  html_ascii = File.read( path )    ## fix/todo: use ASCII8BIT/binary reader !!!!!
+  self.from_string( html_ascii )
+end
+def self.from_string( html_ascii )   ## note: expects ASCII-7BIT/BINARY encoding
+  self.new( html_ascii )
+end
+## read-only accessors; all values get set in initialize
+attr_reader :html_ascii,     ## full "original" 1:1 page in "original/ascii8/binary" encoding
+            :html,           ## utf-8 encoded profile
+            :html_debug,     ## html w/ mapping markers - rename to html_markers - why? why not?  (note: not set for "empty" pages)
+            :info,            ## page info incl. country_name, region_name, last_updated etc.
+            :errors,          ## encoding errors etc.
+            :sects            ## array of sections (Sect) each holding its subsections (Subsect)
+def initialize( html_ascii )
+  @html_ascii = html_ascii
+  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8  (from binary/ascii8bit)
+  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
+  html_sects =  if @html.empty?
+                   ## note: support "empty" pages - old format waiting for update!!!
+                   ##    cannot parse for now
+                   []  ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
+                else
+                   @html_debug = map_sects( @html )
+                   @html_debug = map_subsects( @html_debug )
+                   split_sects( @html_debug )
+                end
+  pp html_sects
+  ## debug
+  ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
+  @sects = []
+  html_sects.each do |html_sect|
+    html_sect_head = html_sect[0]
+    html_subsects  = html_sect[1]
+    puts html_sect_head
+    puts html_subsects.size
+    ## get section title
+    ##  @SECTION{Economy}  => Economy
+    if html_sect_head =~ /@SECTION{(.+?)}/
+      title = $1.strip
+      puts title
+      sect = Sect.new
+      sect.title = title
+      ## get subsections
+      subsects = []
+      html_subsects.each do |html_subsect|
+        html_subsect_head = html_subsect[0]
+        html_subsect_body = html_subsect[1]
+        if html_subsect_head =~ /@SUBSECTION{(.+?)}/
+          title = $1.strip
+          title = title.sub( /:\z/, '' )    # remove trailing : if present
+          title = title.strip
+          puts title
+          subsect = Subsect.new
+          subsect.title = title     ## todo/fix: cut off trailing colon (:)
+          b = Factbook::ItemBuilder.new( html_subsect_body, title )
+          h = b.read
+          subsect.data = h
+          subsects << subsect
+        else
+          ## warn/fix: no subsection title found
+        end
+      end
+      sect.subsects = subsects
+      @sects << sect
+    else
+      ## warn/fix:  no section title found
+    end
+  end
+  self  ## return self -- needed?? default (standard) anyway?? check and remove
+end
+def map_sects( html )
+   ## convert section titles to "unified" marker
+   ## e.g.
+   ##   <h2>Introduction</h2>
+  title_regex= /<h2>
+                 \s*
+                   (.+?)  ## note: use non-greedy; do NOT allow tags inside for now
+                 \s*
+                <\/h2>
+              /xim
+  html = html.gsub( title_regex ) do |m|
+     puts "** found section >#{$1}<:"
+     puts "   >|#{m}|<"
+     "\n\n@SECTION{#{$1}}\n\n"
+  end
+  html
+end
+def map_subsects( html )
+   ## convert subsection titles to "unified" marker
+   ## e.g.
+   ##  <h3>Disputes - international:</h3>
+  title_regex= /<h3>
+                  \s*
+                   (.+?)                ## note: use non-greedy; allows tags inside - why? why not
+                  \s*
+                 <\/h3>
+               /xim
+  html = html.gsub( title_regex ) do |m|
+     puts "** found subsection >#{$1}<:"
+     puts "   >|#{m}|<"
+     "\n@SUBSECTION{#{$1}}\n"
+  end
+  html
+end
+def split_sects( html )
+  ####
+  #  split html in sections (divided by section headings)
+  #  e.g. remove optional prolog ??,
+  ##   [[heading,sect],
+  ##    [heading,sect],
+  ##    [heading,sect],...]
+  ## note: "wrap" regex in a capture group (just one)
+  ##   String#split will include all catpure groups in the result array
+  section_regex= /(@SECTION{.+?})/  ## note: use non-greedy -- check: need to escape {} ??
+  chunks = html.split( section_regex )
+  ## check if first item is a section or (html) prolog
+  #   if prolog (remove)
+  chunks.slice!(0)  unless chunks[0] =~ /@SECTION/  ## starts w/ @SECTION
+  pairs = chunks.each_slice(2).to_a
+  ## now split subsections
+  newpairs = []
+  pairs.each do |item|
+    ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
+    newpairs << [item[0], split_subsects( item[1]) ]
+  end
+  newpairs
+end
+def split_subsects( html )
+  ####
+  #  split html in subsections (divided by subsection headings)
+  #  e.g. remove optional prolog ??,
+  ##   [[heading,sect],
+  ##    [heading,sect],
+  ##    [heading,sect],...]
+  ## note: "wrap" regex in a capture group (just one)
+  ##   String#split will include all catpure groups in the result array
+  subsection_regex= /(@SUBSECTION{.+?})/  ## note: use non-greedy -- check: need to escape {} ??
+  chunks = html.split( subsection_regex )
+  ## check if first item is a section or (html) prolog
+  #   if prolog (remove)
+  chunks.slice!(0)  unless chunks[0] =~ /@SUBSECTION/  ## starts w/ @SUBSECTION
+  pairs = chunks.each_slice(2).to_a
+  pairs
+end
+end # class Builder
+end # module Factbook

data/lib/factbook-readers/builder_item.rb ADDED

@@ -0,0 +1,185 @@
+# encoding: utf-8
+module Factbook
+class ItemBuilder       ## renameto ItemReader, ItemParser - why? why not??
+  include LogUtils::Logging
+  include NormalizeHelper    ##  e.g. normalize_category
+def initialize( html, name )
+  @html = html
+  @name = name     # add category/field name e.g. Area, Location, etc.
+end
+##
+## <div class="category_data subfield text">
+## Portuguese  (official and most widely spoken language)
+##
+## </div>
+## <div class="category_data note">
+## <p><strong>note:</strong> less common languages include Spanish (border areas and schools), German, Italian, Japanese, English, and a large number of minor Amerindian languages</p>
+## </div>
+def read
+  ## return hash from html snippet
+  doc = Nokogiri::HTML.fragment( @html )
+  data = {}
+  ## note:
+  ##   skip whitespace text nodes (e.g. \n\n etc); just use divs
+  doc_children = doc.children.filter('div')
+  puts "  parsing >#{@name}< - #{doc_children.size} category_data divs(s):"
+  ## hanlde special case for
+  ##  multiple 'grouped_subfield' first
+  ##  e.g. used in
+  ##   - Drinking water source:
+  ##   - Sanitation facility access:
+  grouped_children = []
+  other_children   = []
+  doc_children.each do |div|
+     if div['class'].index( 'grouped_subfield' )
+        grouped_children << div
+     else
+        other_children << div
+     end
+  end
+  ## note: only use special rule if more than one div marked grouped_
+  if grouped_children.size > 1
+    ## continue processing the rest as usual
+    doc_children =  other_children
+    key = nil
+    grouped_children.each do |div|
+       if !div.css( 'span.subfield-group').empty?
+         # start a new group
+         span_group = div.at( 'span.subfield-group')
+         key  = normalize_category( span_group.text.strip )
+         span_group.replace( '' )
+         text = squish( div.text.strip )
+         puts "new group - category_data key >#{key}<: >#{text}<"
+         data[ key ] = { 'text' => text }
+       else
+         ## append to (last) group
+         text = squish( div.text.strip )
+         puts "add group - category_data key >#{key}<: >#{text}<"
+         data[ key ]['text'] += " / #{text}"
+       end
+    end
+  end
+  doc_children.each_with_index do |div,i|
+    if div['class'].index( 'note' )
+      text = squish( div.text.strip )
+      puts "category_data: >#{text}<"
+      ## note: for now only allow one note per subsection/field data block
+      if data['note']
+        puts "!! ERROR: note already taken:"
+        puts data['note']
+        puts  div.to_html
+        exit 1
+      end
+      data['note'] = { 'text' => text }
+    elsif div['class'].index( 'historic' )
+      ## add all historic together into one for now
+        text = squish( div.text.strip )
+        puts "category_data: >#{text}<"
+        if data['text']
+          ## append with / for now
+          data['text'] += " / #{text}"
+        else
+          data['text'] = text
+          ## check if history is first node
+          if i != 0
+            puts "!! ERROR: expected first historic node to be first node but it is #{i+1}:"
+            puts div.to_html
+            exit 1
+          end
+        end
+      elsif div.css( 'span.subfield-name').empty?
+        ## assume "implied text field"
+        ## check for index == 1 / child count == 1 - why? why not
+        text = squish( div.text.strip )    ## fix/todo: use strip
+        puts "category_data: >#{text}<"
+        data['text'] = text
+        ## must be always first node for now
+        if i != 0
+          puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
+          puts div.to_html
+          exit 1
+        end
+    elsif div['class'].index( 'grouped_subfield' )
+## split grouped subfield!!
+##   <span class="subfield-name">arable land:</span>
+## <span class="subfield-number">8.6%</span>
+## <span class="subfield-date">(2011 est.)</span>
+##  /
+## <span class="subfield-name">permanent crops:</span>
+## <span class="subfield-number">0.8%</span>
+## <span class="subfield-date">(2011 est.)</span>
+##   /
+## <span class="subfield-name">permanent pasture:</span>
+## <span class="subfield-number">23.5%</span>
+## <span class="subfield-date">(2011 est.)</span>
+## join names for now - why? why not?
+##  e.g. becomes:
+##   arable land / permanent crops / permanent pasture: for key ??
+     span_names = div.css( 'span.subfield-name')
+     keys = []
+     span_names.each do |span|
+       keys << normalize_category( span.text.strip )
+       span.replace( '' )
+     end
+     key = keys.join( ' / ')
+     text = squish( div.text.strip )
+     puts "category_data key >#{key}<: >#{text}<"
+     data[ key ] = { 'text' => text }
+    else
+      ## get subfield name
+      span_names = div.css( 'span.subfield-name')
+      if span_names.size > 1
+        puts "!! ERROR - found more than one subfield-name:"
+        puts div.to_html
+        exit 1
+      end
+      key = normalize_category( span_names[0].text.strip )
+      span_names[0].replace( '' )
+      text = squish( div.text.strip )
+      puts "category_data key >#{key}<: >#{text}<"
+      data[ key ] = { 'text' => text }
+    end
+  end
+  pp data
+  data
+end
+def squish( str )
+  str.gsub( /[ \t\n\r]{2,}/, ' ')  ## replace multi-spaces (incl. newlines with once space)
+end
+end # class ItemBuilder
+end # module Factbook