RubyGems - factbook - Versions diffs - 0.1.3 → 1.0.0 - Mend

factbook 0.1.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

checksums.yaml +7 -0
data/Manifest.txt +34 -22
data/README.md +8 -3
data/Rakefile +2 -263
data/data/codes.csv +262 -0
data/data/comparisons.csv +75 -0
data/lib/factbook/builder.rb +214 -0
data/lib/factbook/builder_item.rb +93 -0
data/lib/factbook/codes.rb +119 -0
data/lib/factbook/comparisons.rb +50 -0
data/lib/factbook/page.rb +103 -303
data/lib/factbook/sanitizer.rb +214 -0
data/lib/factbook/sect.rb +29 -196
data/lib/factbook/subsect.rb +18 -0
data/lib/factbook/table.rb +52 -0
data/lib/factbook/utils.rb +85 -0
data/lib/factbook/utils_info.rb +102 -0
data/lib/factbook/version.rb +4 -3
data/lib/factbook.rb +23 -1
data/test/data/au.html +579 -0
data/test/data/au.yml +8 -0
data/test/data/be.html +596 -0
data/test/data/be.yml +8 -0
data/test/data/src/au.html +2006 -0
data/test/data/src/be.html +2011 -0
data/test/helper.rb +0 -4
data/test/test_builder.rb +37 -0
data/test/test_codes.rb +76 -0
data/test/test_comparisons.rb +19 -0
data/test/test_fields.rb +21 -18
data/test/test_item_builder.rb +99 -0
data/test/test_json.rb +17 -20
data/test/test_page.rb +18 -10
data/test/test_sanitizer.rb +35 -0
metadata +68 -49
data/.gemtest +0 -0
data/test/data/countrytemplate_au.html +0 -4179
data/test/data/countrytemplate_be.html +0 -4260
data/test/data/countrytemplate_br.html +0 -4366
data/test/data/countrytemplate_ee.html +0 -2999
data/test/data/countrytemplate_ls.html +0 -2728
data/test/data/countrytemplate_mx.html +0 -4397
data/test/data/countrytemplate_vt.html +0 -1726
data/test/data/countrytemplate_xx.html +0 -2898
data/test/test_page_old.rb +0 -478
data/test/test_strip.rb +0 -66

data/data/comparisons.csv ADDED Viewed

@@ -0,0 +1,75 @@
+Num,Category,Name
+2147,Geography,Area
+2119,People and Society,Population
+2002,People and Society,Population growth rate
+2054,People and Society,Birth rate
+2066,People and Society,Death rate
+2112,People and Society,Net migration rate
+2223,People and Society,Maternal mortality rate
+2091,People and Society,Infant mortality rate
+2102,People and Society,Life expectancy at birth
+2127,People and Society,Total fertility rate
+2225,People and Society,Health expenditures
+2155,People and Society,HIV/AIDS - adult prevalence rate
+2156,People and Society,HIV/AIDS - people living with HIV/AIDS
+2157,People and Society,HIV/AIDS - deaths
+2228,People and Society,Obesity - adult prevalence rate
+2224,People and Society,Children under the age of 5 years underweight
+2206,People and Society,Education expenditures
+2229,People and Society,"Unemployment, youth ages 15-24"
+2001,Economy,GDP (purchasing power parity)
+2003,Economy,GDP - real growth rate
+2004,Economy,GDP - per capita (PPP)
+2260,Economy,Gross national saving
+2089,Economy,Industrial production growth rate
+2095,Economy,Labor force
+2129,Economy,Unemployment rate
+2172,Economy,Distribution of family income - Gini index
+2221,Economy,Taxes and other revenues
+2222,Economy,Budget surplus (+) or deficit (-)
+2186,Economy,Public debt
+2092,Economy,Inflation rate (consumer prices)
+2207,Economy,Central bank discount rate
+2208,Economy,Commercial bank prime lending rate
+2214,Economy,Stock of narrow money
+2215,Economy,Stock of broad money
+2211,Economy,Stock of domestic credit
+2200,Economy,Market value of publicly traded shares
+2187,Economy,Current account balance
+2078,Economy,Exports
+2087,Economy,Imports
+2188,Economy,Reserves of foreign exchange and gold
+2079,Economy,Debt - external
+2198,Economy,Stock of direct foreign investment - at home
+2199,Economy,Stock of direct foreign investment - abroad
+2232,Energy,Electricity - production
+2233,Energy,Electricity - consumption
+2234,Energy,Electricity - exports
+2235,Energy,Electricity - imports
+2236,Energy,Electricity - installed generating capacity
+2237,Energy,Electricity - from fossil fuels
+2239,Energy,Electricity - from nuclear fuels
+2238,Energy,Electricity - from hydroelectric plants
+2240,Energy,Electricity - from other renewable sources
+2241,Energy,Crude oil - production
+2242,Energy,Crude oil - exports
+2243,Energy,Crude oil - imports
+2244,Energy,Crude oil - proved reserves
+2245,Energy,Refined petroleum products - production
+2246,Energy,Refined petroleum products - consumption
+2247,Energy,Refined petroleum products - exports
+2248,Energy,Refined petroleum products - imports
+2249,Energy,Natural gas - production
+2250,Energy,Natural gas - consumption
+2251,Energy,Natural gas - exports
+2252,Energy,Natural gas - imports
+2253,Energy,Natural gas - proved reserves
+2150,Communications,Telephones - fixed lines
+2151,Communications,Telephones - mobile cellular
+2153,Communications,Internet users
+2053,Transportation,Airports
+2121,Transportation,Railways
+2085,Transportation,Roadways
+2093,Transportation,Waterways
+2108,Transportation,Merchant marine
+2034,Military,Military expenditures

data/lib/factbook/builder.rb ADDED Viewed

@@ -0,0 +1,214 @@
+# encoding: utf-8
+module Factbook
+class Builder     ## todo: change to PageBuilder ???
+  include LogUtils::Logging
+=begin
+def self.from_cc( cc, opts={} )  ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
+  ## check/todo: rename input_dir to just dir or to include ?
+  ##   (there's no output_dir)?? - why? why not?
+  input_dir = opts[:input_dir] || '.'
+  self.from_file( "#{input_dir}/#{cc}.html" )
+end
+=end
+def self.from_file( path )
+  html_ascii = File.read( path )    ## fix/todo: use ASCII8BIT/binary reader !!!!!
+  self.new( html_ascii )
+end
+attr_reader :html_ascii,     ## full "original" 1:1 page in "original/ascii8/binary" encoding
+            :html,           ## utf-8 encoded profile
+            :html_debug,     ## html w/ mapping markers - rename to html_markers - why? why not?
+            :page_info,      ## incl. country_name, region_name, last_updated etc.
+            :errors,          ## encoding errors etc.
+            :page            ## Page (sects w/ subsects) built in initialize
+def initialize( html_ascii )
+  @html_ascii = html_ascii
+  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8  (from binary/ascii8bit)
+  @html, @page_info, @errors = Sanitizer.new.sanitize( @html_ascii )
+  @html_debug = map_sects( @html )
+  @html_debug = map_subsects( @html_debug )
+  html_sects = split_sects( @html_debug )
+  pp html_sects
+  page = Page.new
+  sects = []
+  html_sects.each do |html_sect|
+    html_sect_head = html_sect[0]
+    html_subsects  = html_sect[1]
+    puts html_sect_head
+    puts html_subsects.size
+    ## get section title
+    ##  @SECTION{Economy}  => Economy
+    if html_sect_head =~ /@SECTION{(.+?)}/
+      title = $1.strip
+      puts title
+      sect = Sect.new
+      sect.title = title
+      ## get subsections
+      subsects = []
+      html_subsects.each do |html_subsect|
+        html_subsect_head = html_subsect[0]
+        html_subsect_body = html_subsect[1]
+        if html_subsect_head =~ /@SUBSECTION{(.+?)}/
+          title = $1.strip
+          title = title.sub( /:\z/, '' )    # remove trailing : if present
+          title = title.strip
+          puts title
+          subsect = Subsect.new
+          subsect.title = title     ## todo/fix: cut off trailing colon (:)
+          b = Factbook::ItemBuilder.new( html_subsect_body, title )
+          h = b.read
+          subsect.data = h
+          subsects << subsect
+        else
+          ## warn/fix: no subsection title found
+        end
+      end
+      sect.subsects = subsects
+      sects << sect
+    else
+      ## warn/fix:  no section title found
+    end
+  end
+  page.sects = sects
+  @page = page
+  pp page
+  self  ## return self -- needed?? default (standard) anyway?? check and remove
+end
+def map_sects( html )
+   ## convert section titles
+   ##   from  <h2>..</h2>
+   ##   to "unified" marker
+  ## e.g.
+  ##  <h2 sectiontitle='Introduction' ccode='au'>Introduction ::  <span class='region'>AUSTRIA </span></h2>
+  ##  <h2>Introduction</h2>
+  title_regex= /<h2
+                 (?:\s[^>]+)?  ## allow optional attributes in h2
+                 >
+                 \s*
+                   ([^<>]+?)  ## note: use non-greedy; do NOT allow tags inside for now
+                 \s*
+                 (?:\s::\s
+                   .+?       ## note: use non-greedy; allows tags inside
+                 )?          ## strip optional name (e.g.  :: AUSTRIA)
+                <\/h2>
+              /xim
+  html = html.gsub( title_regex ) do |m|
+     puts "** found section >#{$1}<:"
+     puts "   >|#{m}|<"
+     "\n\n@SECTION{#{$1}}\n\n"
+  end
+  html
+end
+def map_subsects( html )
+   ## convert subsection titles
+   ##   from  <div id='field'>..</div>
+   ##   to "unified" marker
+  ## e.g.
+  ##  <div id='field' class='category'>Disputes - international:</div>
+  title_regex= /<div \s id='field'
+                     \s class='category'>
+                   \s*
+                   (.+?)                ## note: use non-greedy; allows tags inside - why? why not
+                   \s*
+                 <\/div>
+               /xim
+  html = html.gsub( title_regex ) do |m|
+     puts "** found subsection >#{$1}<:"
+     puts "   >|#{m}|<"
+     "\n@SUBSECTION{#{$1}}\n"
+  end
+  html
+end
+def split_sects( html )
+  ####
+  #  split html in sections (divided by section headings)
+  #  e.g. remove optional prolog ??,
+  ##   [[heading,sect],
+  ##    [heading,sect],
+  ##    [heading,sect],...]
+  ## note: "wrap" regex in a capture group (just one)
+  ##   String#split will include all catpure groups in the result array
+  section_regex= /(@SECTION{.+?})/  ## note: use non-greedy -- check: need to escape {} ??
+  chunks = html.split( section_regex )
+  ## check if first item is a section or (html) prolog
+  #   if prolog (remove)
+  chunks.slice!(0)  unless chunks[0] =~ /@SECTION/  ## starts w/ @SECTION
+  pairs = chunks.each_slice(2).to_a
+  ## now split subsections
+  newpairs = []
+  pairs.each do |item|
+    ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
+    newpairs << [item[0], split_subsects( item[1]) ]
+  end
+  newpairs
+end
+def split_subsects( html )
+  ####
+  #  split html in subsections (divided by subsection headings)
+  #  e.g. remove optional prolog ??,
+  ##   [[heading,sect],
+  ##    [heading,sect],
+  ##    [heading,sect],...]
+  ## note: "wrap" regex in a capture group (just one)
+  ##   String#split will include all catpure groups in the result array
+  subsection_regex= /(@SUBSECTION{.+?})/  ## note: use non-greedy -- check: need to escape {} ??
+  chunks = html.split( subsection_regex )
+  ## check if first item is a section or (html) prolog
+  #   if prolog (remove)
+  chunks.slice!(0)  unless chunks[0] =~ /@SUBSECTION/  ## starts w/ @SUBSECTION
+  pairs = chunks.each_slice(2).to_a
+  pairs
+end
+end # class Builder
+end # module Factbook

data/lib/factbook/builder_item.rb ADDED Viewed

@@ -0,0 +1,93 @@
+# encoding: utf-8
+module Factbook
+class ItemBuilder       ## renameto ItemReader, ItemParser - why? why not??
+  include LogUtils::Logging
+def initialize( html, name )
+  @html = html
+  @name = name     # add category/field name e.g. Area, Location, etc.
+end
+def read
+  ## return hash from html snippet
+  doc = Nokogiri::HTML.fragment( @html )
+  data = {}
+  last_node = nil     ## track last hash (always use text key)
+  last_node_data_count = 0
+  ## note:
+  ##   skip whitespace text nodes (e.g. \n\n etc); just use divs
+  doc.children.filter('div').each_with_index do |child,i|
+    if child['class'] == 'category_data'
+       text = child.text    ## fix/todo: use strip
+       puts "category_data: >#{text}<"
+       if last_node.nil?
+          ## assume its the very first entry; use implied/auto-created category
+          data['text'] = ''
+          last_node = data
+          last_node_data_count = 0
+       end
+       ### first category_data element?
+      if last_node_data_count == 0
+         if last_node['text'] == ''
+            last_node['text'] = text
+         else   ### possible ??? if data_count is zero - not should not include any data
+            ## todo: issue warning here - why? why not??
+            last_node['text'] += " #{text}"    ## append w/o separator
+         end
+      else
+        if @name == 'demographic_profile' || @name == 'Demographic profile'  ## special case (use space a sep)
+            last_node['text'] += " #{text}"   ## append without (w/o) separator
+        else
+            last_node['text'] += " ++ #{text}"   ## append with ++ separator
+        end
+      end
+      last_node_data_count += 1
+    elsif child['class'].nil?    ## div without any class e.g. <div>..</div>
+                                 ##   assume category and category_data pair w/ spans
+      spans = child.children.filter('span')
+      if spans.size > 2
+        puts "*** warn: expected two (or one) spans; got #{spans.inspect}"
+      end
+      ## pp spans
+      span_key   = spans[0]  ## assume 1st entry is span.category
+      span_value = spans[1]  ## assume 2nd entry is span.category_data')
+      ## allow optional category_data for now
+      key   = span_key.text
+      key   = key.strip
+      key   = key.sub( /:\z/, '' )    # remove trailing : if present
+      key   = key.strip
+      value = span_value ? span_value.text : nil
+      puts "key: >#{key}<, value: >#{value}< : #{value.class.name}"
+      ## start new pair
+      last_node = data[key] = { 'text' => value }
+      last_node_data_count =  value ? 1 : 0    ## note: set to 1 if value present
+    else
+      puts "*** warn: item builder -- unknow css class in #{child.inspect}"
+    end
+    ## pp child
+    ## css = child['class']
+    ## puts "[#{i}] #{child.name}  class='>#{css}< : #{css.class.name}' >#{child.text}<"
+  end
+  pp data
+  data
+end
+end # class ItemBuilder
+end # module Factbook

data/lib/factbook/codes.rb ADDED Viewed

@@ -0,0 +1,119 @@
+# encoding: utf-8
+##
+# note:
+#   the factbook's own category/region for the world entry is "other entities" (on the FAQ)
+#    and "oceans" (on the page itself); changed to world here
+module Factbook
+class Codes
+  Code = Struct.new( :code,      ## todo: add notes (country affiliation) - why? why not??
+                     :name,
+                     :category,  ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
+                     :region,    ## e.g. Europe, Oceans, etc.
+                    )
+  def self.from_csv( path )
+    ###
+    #  note:
+    #   if you use quotes - NO leading spaces allowed e.g.
+    #  use au,"Austria",... and NOT
+    #      au, "Austria", ...
+    #
+    #  for headers - NO leading spaces allowed e.g.
+    #   use  Code,Name,Category,Region,...   and NOT
+    #        Code, Name, Category, Region, ...
+    rows = CSV.read( path, headers: true )
+    pp rows
+    recs = []
+    rows.each do |row|
+      pp row
+      rec = Code.new
+      rec.code     = row['Code'].strip    ## remove leading n trailing whitespaces
+      rec.name     = row['Name'].strip
+      ## note: for now category and region are optional
+      rec.category = row['Category'].strip    if row['Category']
+      rec.region   = row['Region'].strip      if row['Region']
+      pp rec
+      recs << rec
+    end
+    self.new( recs )
+  end
+  def initialize( codes )
+    @codes = codes
+  end
+  def size() @codes.size; end
+  def each
+    @codes.each {|code| yield( code ) }
+  end
+  def to_a
+    @codes.collect {|code| code.code }   ## return array of codes
+  end
+  ##  def all()  self.to_a; end    ## note: alias for to_a - use - why? why not??
+  ## "pre-defined" convenience shortcuts
+  def countries()       category 'Countries';     end
+  def world()           category 'World';         end
+  def oceans()          category 'Oceans';        end
+  def misc()            category 'Miscellaneous'; end
+  def others()          category 'Other';         end
+  def dependencies()    category 'Dependencies';  end
+  def dependencies_us() category 'Dependencies (United States)'; end
+## fix/todo: add all dependencies  uk (or gb?), fr,cn,au,nz,no,dk,etc.
+  def europe()               region 'Europe';            end
+  def south_asia()           region 'South Asia';        end
+  def central_asia()         region 'Central Asia';      end
+  def east_n_souteast_asia() region 'East & Southeast Asia'; end
+  def middle_east()          region 'Middle East';       end
+  def africa()               region 'Africa';            end
+  def north_america()        region 'North America';     end
+  def central_america_n_caribbean() region 'Central America and Caribbean'; end
+  def south_america()        region 'South America';     end
+  def australia_oceania()    region 'Australia-Oceania'; end
+  def antartica()            region 'Antarctica';        end
+  ## note: regions oceans and world - same as category oceans and world
+  ##     use oceans_ii or world_ii or something ??
+  ##   use category('World')  n region('World')
+  ##   use category('Oceans') n region('Oceans')
+  def category( query )
+    ## todo/future: allow passing in of regex too (not just string)
+    ## note: e.g. Dependencies (France) needs to get escpaed to
+    ##            Dependencies \(France\)  etc.
+    filter_regex = /#{Regexp.escape(query)}/i
+    codes = @codes.select do |code|
+      code.category ? filter_regex.match( code.category ) : false   ## note: allow nil for category; will fail on search
+    end
+    Codes.new( codes )   ## return new Codes obj for easy-chaining
+  end
+  def region( query )
+    ## todo/future: allow passing in of regex too (not just string)
+    filter_regex = /#{Regexp.escape(query)}/i
+    codes = @codes.select do |code|
+       code.region ? filter_regex.match( code.region ) : false      ## note: allow nil for region; will fail on search
+    end
+    Codes.new( codes )   ## return new Codes obj for easy-chaining
+  end
+end  # class codes
+end # module Factbook

data/lib/factbook/comparisons.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# encoding: utf-8
+module Factbook
+class Comparisons
+  Comparison = Struct.new( :num,        ### todo: use no or id or something - why? why not?
+                           :category,  ## e.g. Geography, People, Economy, etc.
+                           :name,
+                          )
+  def self.from_csv( path )
+    rows = CSV.read( path, headers: true )
+    pp rows
+    recs = []
+    rows.each do |row|
+      pp row
+      rec = Comparison.new
+      rec.num      = row['Num'].strip.to_i    ## remove leading n trailing whitespaces
+      rec.category = row['Category'].strip
+      rec.name     = row['Name'].strip
+      pp rec
+      recs << rec
+    end
+    self.new( recs )
+  end
+  def initialize( comps )
+    @comps = comps
+  end
+  def size() @comps.size; end
+  def each
+    @comps.each {|comp| yield( comp ) }
+  end
+  def to_a
+    @comps.collect {|comp| comp.num }   ## return array of nums   -- return something else - why? why not?
+  end
+end  # class Comparison
+end # module Factbook