RubyGems - factbook-readers - Versions diffs - 0.0.1 → 1.0.0 - Mend

factbook-readers 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

checksums.yaml +4 -4
data/Manifest.txt +0 -16
data/README.md +13 -14
data/data/codes.csv +1 -1
data/lib/factbook-readers.rb +11 -12
data/lib/factbook-readers/builder.rb +28 -53
data/lib/factbook-readers/builder_json.rb +9 -20
data/lib/factbook-readers/codes.rb +3 -2
data/lib/factbook-readers/comparisons.rb +2 -2
data/lib/factbook-readers/page.rb +59 -85
data/lib/factbook-readers/sanitizer.rb +13 -34
data/lib/factbook-readers/version.rb +2 -2
data/test/helper.rb +1 -5
data/test/test_builder.rb +1 -6
data/test/test_codes.rb +5 -9
data/test/test_comparisons.rb +2 -5
data/test/test_counter.rb +4 -6
data/test/test_fields.rb +0 -2
data/test/test_item_builder.rb +7 -9
data/test/test_json.rb +1 -3
data/test/test_json_builder.rb +1 -3
data/test/test_normalize.rb +0 -2
data/test/test_page.rb +2 -4
data/test/test_sanitizer.rb +2 -5
data/test/test_sanitizer_regex.rb +0 -2
metadata +2 -18
data/data/attributes.yml +0 -337
data/lib/factbook-readers/attributes.rb +0 -74
data/test/data/au.html +0 -579
data/test/data/au.yml +0 -8
data/test/data/be.html +0 -596
data/test/data/be.yml +0 -8
data/test/data/json/au.json +0 -892
data/test/data/src/ag.html +0 -716
data/test/data/src/au-2015-09-24.html +0 -2006
data/test/data/src/au.html +0 -658
data/test/data/src/be-2015-09-24.html +0 -2011
data/test/data/src/be.html +0 -648
data/test/test_attribs.rb +0 -87
data/test/test_attribs_def.rb +0 -20
data/test/test_convert.rb +0 -30
data/test/test_importer.rb +0 -56

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: dc307d415f957d373118337b72baa5ca9c0b5686
-  data.tar.gz: f5241909514a895878e29b1e9e3dd0f3fddf9088
+  metadata.gz: d9bc3eaf2cb6fa3774e0b7a25b53336be2b05a55
+  data.tar.gz: 86565dc92913645110beec803d7bc0a7c088155f
 SHA512:
-  metadata.gz: cd89c3f31089bb3256969076a69fe2dcc3e04f762033cec14e621d5bbff289f72aecd4f6a67e0c32b864588c56f3a57e8059f9957f885293875a95695dd22059
-  data.tar.gz: 5f55a31397cbfa186cb85597ec280a2fc94fe63e8b595ccf18e36e2168dff531e19d0a64a63281fba380356df23b56769318aecf1d84c52277e511e5cb21998f
+  metadata.gz: 755b8727d0bbcaecd97f52064b1b29321e1b59a72bff55bbdd995ed8968732def7480f4cab0f222bf4c9d163afbd5230647237b96d41dc136006f0f9a9473550
+  data.tar.gz: 638dcf4f4a552c705a743c9e7483e457303d2090a9204ee3d4b390c3256b537050ea4fdc0957cba461dcee549ecf52b541c4b691dcb8c500c4439eaf376d4a87

data/Manifest.txt CHANGED

@@ -2,13 +2,11 @@ CHANGELOG.md
 Manifest.txt
 README.md
 Rakefile
-data/attributes.yml
 data/categories.csv
 data/codes.csv
 data/codesxref.csv
 data/comparisons.csv
 lib/factbook-readers.rb
-lib/factbook-readers/attributes.rb
 lib/factbook-readers/builder.rb
 lib/factbook-readers/builder_item.rb
 lib/factbook-readers/builder_json.rb
@@ -27,26 +25,12 @@ lib/factbook-readers/utils.rb
 lib/factbook-readers/utils_info.rb
 lib/factbook-readers/version.rb
 lib/factbook/readers.rb
-test/data/au.html
-test/data/au.yml
-test/data/be.html
-test/data/be.yml
-test/data/json/au.json
-test/data/src/ag.html
-test/data/src/au-2015-09-24.html
-test/data/src/au.html
-test/data/src/be-2015-09-24.html
-test/data/src/be.html
 test/helper.rb
-test/test_attribs.rb
-test/test_attribs_def.rb
 test/test_builder.rb
 test/test_codes.rb
 test/test_comparisons.rb
-test/test_convert.rb
 test/test_counter.rb
 test/test_fields.rb
-test/test_importer.rb
 test/test_item_builder.rb
 test/test_json.rb
 test/test_json_builder.rb

data/README.md CHANGED

@@ -55,36 +55,35 @@ resulting in:
     ...
 ```
-### Use shortcut attribute accessors
+### Use data attributes
 ```ruby
-pp page.background        ## same as page['Introduction']['Background']['text']
+pp page['Introduction']['Background']['text']
 # => "Following more than three centuries..."
-pp page.area              ## same as page['Geography'][''Area']['total']['text']
+pp page['Geography']['Area']['total']['text']
 # => "8,515,770 sq km"
-pp page.area_land         ## same as page['Geography'][''Area']['land']['text']
+pp page['Geography']['Area']['land']['text']
 # => "8,358,140 sq km"
-pp page.area_water        ## same as page['Geography'][''Area']['water']['text']
+pp page['Geography']['Area']['water']['text']
 # => "157,630 sq km"
-pp page.area_note         ## same as page['Geography'][''Area']['note']['text']
+pp page['Geography']['Area']['note']['text']
 # => "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."
-pp page.area_comparative  ## same as page['Geography']['Area - comparative']['text']
+pp page['Geography']['Area - comparative']['text']
 # => "slightly smaller than the US"
-pp page.climate           ## same as page['Geography']['Climate']['text']
+pp page['Geography']['Climate']['text']
 # => "mostly tropical, but temperate in south"
-pp page.terrain           ## same as page['Geography']['Terrain']['text']
+pp page['Geography']['Terrain']['text']
 # => "mostly flat to rolling lowlands in north; ..."
-pp page.elevation_lowest  ## same as page['Geography']['Elevation extremes']['lowest point']['text']
+pp page['Geography']['Elevation extremes']['lowest point']['text']
 # => "Atlantic Ocean 0 m"
-pp page.elevation_highest ## same as page['Geography']['Elevation extremes']['highest point']['text']
+pp page['Geography']['Elevation extremes']['highest point']['text']
 # => "Pico da Neblina 2,994 m"
-pp page.resources         ## same as page['Geography'][Natural resources']['text']
+pp page['Geography']['Natural resources']['text']
 # => "bauxite, gold, iron ore, manganese, nickel, phosphates, ..."
 ...
 ```
-See [`data/attributes.yml`](data/attributes.yml) for the full listing of all built-in attribute shortcut accessors.
-See [Attributes](ATTRIBUTES.md) for a quick reference listing.
+See [Attributes](../ATTRIBUTES.md) for a quick reference listing.
 ### Save to disk as JSON

data/data/codes.csv CHANGED

@@ -216,7 +216,7 @@ sb,Saint Pierre and Miquelon,Dependencies (France),North America
 wf,Wallis and Futuna,Dependencies (France),Australia-Oceania
 aa,Aruba,Dependencies (Netherlands),Central America and Caribbean
 uc,Curacao,Dependencies (Netherlands),Central America and Caribbean
-sk,Sint Maarten,Dependencies (Netherlands),Central America and Caribbean
+nn,Sint Maarten,Dependencies (Netherlands),Central America and Caribbean
 cw,Cook Islands,Dependencies (New Zealand),Australia-Oceania
 ne,Niue,Dependencies (New Zealand),Australia-Oceania
 tl,Tokelau,Dependencies (New Zealand),Australia-Oceania

data/lib/factbook-readers.rb CHANGED

@@ -17,22 +17,21 @@ require 'factbook-readers/version' # let it always go first
 require 'factbook-readers/codes'
 require 'factbook-readers/comparisons'
-require 'factbook-readers/attributes'
-module Factbook
-  ##  auto-load builtin codes, comparisons, attributes, etc.
-  CODES       = Codes.from_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" )
-  COMPARISONS = Comparisons.from_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
-  ATTRIBUTES  = Attributes.from_yaml( "#{Factbook::Module::Readers.root}/data/attributes.yml" )
-  def self.codes()       CODES; end
-  def self.comparisons() COMPARISONS; end
-  def self.attributes()  ATTRIBUTES; end
+## note: make codes, comparisons available
+module Factbook
+  ##  note: load on demand only builtin codes, comparisons, etc.
+  ##          for now
+  def self.codes
+    @@codes       ||= Codes.read_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" );
+  end
+  def self.comparisons
+    @@comparisons ||= Comparisons.read_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
+  end
 end # module Factbook
-## note: make codes, comparisons, attributes available
 require 'factbook-readers/utils'
 require 'factbook-readers/utils_info'

data/lib/factbook-readers/builder.rb CHANGED

@@ -1,49 +1,29 @@
-# encoding: utf-8
 module Factbook
-class Builder     ## todo: change to PageBuilder ???
+class Builder     ## todo: change to HtmlBuilder or PageBuilder ???
   include LogUtils::Logging
-=begin
-def self.from_cc( cc, opts={} )  ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
-  ## check/todo: rename input_dir to just dir or to include ?
-  ##   (there's no output_dir)?? - why? why not?
-  input_dir = opts[:input_dir] || '.'
-  self.from_file( "#{input_dir}/#{cc}.html" )
-end
-=end
-def self.from_file( path )
-  html_ascii = File.read( path )    ## fix/todo: use ASCII8BIT/binary reader !!!!!
-  self.from_string( html_ascii )
-end
-def self.from_string( html_ascii )   ## note: expects ASCII-7BIT/BINARY encoding
-  self.new( html_ascii )
-end
-attr_reader :html_ascii,     ## full "original" 1:1 page in "original/ascii8/binary" encoding
-            :html,           ## utf-8 encoded profile
-            :html_debug,     ## html w/ mapping markers - rename to html_markers - why? why not?
+attr_reader :html_original,    ## full "original" 1:1 page
+            :html,             ## cut-out and sanitized profile
+            :html_debug,      ## html w/ mapping markers - rename to html_markers - why? why not?
             :info,            ## page info incl. country_name, region_name, last_updated etc.
             :errors,          ## encoding errors etc.
             :sects
-def initialize( html_ascii )
-  @html_ascii = html_ascii
+def initialize( html_original )
+  @html_original = html_original
-  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8  (from binary/ascii8bit)
-  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
+  @html, @info, @errors = Sanitizer.new.sanitize( @html_original )
   html_sects =  if @html.empty?
                    ## note: support "empty" pages - old format waiting for update!!!
                    ##    cannot parse for now
+                   @html_debug = ''
                    []  ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
                 else
                    @html_debug = map_sects( @html )
@@ -55,7 +35,7 @@ def initialize( html_ascii )
   pp html_sects
   ## debug
-  ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
+  ##   File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
   @sects = []
@@ -101,25 +81,22 @@ def initialize( html_ascii )
       ## warn/fix:  no section title found
     end
   end
-  self  ## return self -- needed?? default (standard) anyway?? check and remove
 end
+H2_RE = /<h2>
+          \s*
+         (.+?)  ## note: use non-greedy; do NOT allow tags inside for now
+          \s*
+         <\/h2>
+        /xim
 def map_sects( html )
    ## convert section titles to "unified" marker
    ## e.g.
    ##   <h2>Introduction</h2>
-  title_regex= /<h2>
-                 \s*
-                   (.+?)  ## note: use non-greedy; do NOT allow tags inside for now
-                 \s*
-                <\/h2>
-              /xim
-  html = html.gsub( title_regex ) do |m|
+  html = html.gsub( H2_RE ) do |m|
      puts "** found section >#{$1}<:"
      puts "   >|#{m}|<"
@@ -129,19 +106,19 @@ def map_sects( html )
 end
+H3_RE = /<h3>
+          \s*
+         (.+?)                ## note: use non-greedy; allows tags inside - why? why not
+          \s*
+        <\/h3>
+       /xim
 def map_subsects( html )
    ## convert subsection titles to "unified" marker
    ## e.g.
    ##  <h3>Disputes - international:</h3>
-  title_regex= /<h3>
-                  \s*
-                   (.+?)                ## note: use non-greedy; allows tags inside - why? why not
-                  \s*
-                 <\/h3>
-               /xim
-  html = html.gsub( title_regex ) do |m|
+  html = html.gsub( H3_RE ) do |m|
      puts "** found subsection >#{$1}<:"
      puts "   >|#{m}|<"
@@ -163,9 +140,8 @@ def split_sects( html )
   ## note: "wrap" regex in a capture group (just one)
   ##   String#split will include all capture groups in the result array
-  section_regex= /(@SECTION{.+?})/  ## note: use non-greedy -- check: need to escape {} ??
-  chunks = html.split( section_regex )
+  ## note: use non-greedy -- check: need to escape {} ??
+  chunks = html.split( /(@SECTION{.+?})/ )
   ## check if first item is a section or (html) prolog
   #   if prolog (remove)
@@ -194,9 +170,8 @@ def split_subsects( html )
   ## note: "wrap" regex in a capture group (just one)
   ##   String#split will include all capture groups in the result array
-  subsection_regex= /(@SUBSECTION{.+?})/  ## note: use non-greedy -- check: need to escape {} ??
-  chunks = html.split( subsection_regex )
+  ## note: use non-greedy -- check: need to escape {} ??
+  chunks = html.split( /(@SUBSECTION{.+?})/ )
   ## check if first item is a section or (html) prolog
   #   if prolog (remove)

data/lib/factbook-readers/builder_json.rb CHANGED

@@ -1,25 +1,14 @@
-# encoding: utf-8
 module Factbook
 ######
 # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
-class JsonBuilder
+class JsonBuilder
   include LogUtils::Logging
   include NormalizeHelper    ##  e.g. normalize_category
-def self.from_file( path )
-  text = File.read( path )     ## fix: use File.read_utf8  from textutils
-  self.from_string( text )
-end
-def self.from_string( text )
-  self.new( text )
-end
 attr_reader :text,
             :json,
             :info,            ## not used yet -- page info incl. country_name, region_name, last_updated etc.
@@ -29,7 +18,7 @@ attr_reader :text,
 def initialize( text )
   @text = text
   @json = JSON.parse( text )
   @info   = nil   ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
@@ -40,16 +29,16 @@ def initialize( text )
   @json.each do |k1,v1|
     sect_title    = k1
     sect_subsects = v1
     sect = Sect.new
     sect.title = sect_title
     ## get subsections
     subsects = []
     sect_subsects.each do |k2,v2|
       subsect_title = k2
       subsect_data  = v2
       subsect = Subsect.new
       subsect.title = subsect_title
@@ -61,13 +50,13 @@ def initialize( text )
           new_subsect_data[ normalize_category(k3) ] = v3
         end
         subsect_data = new_subsect_data
-      end
+      end
       subsect.data  = subsect_data
       subsects << subsect
     end
     sect.subsects = subsects
     @sects << sect
   end

data/lib/factbook-readers/codes.rb CHANGED

@@ -16,7 +16,7 @@ class Codes
                      :region,    ## e.g. Europe, Oceans, etc.
                     )
-  def self.from_csv( path )
+  def self.read_csv( path )
     ###
     #  note:
     #   if you use quotes - NO leading spaces allowed e.g.
@@ -46,9 +46,10 @@ class Codes
       recs << rec
     end
-    self.new( recs )
+    new( recs )
   end
   def initialize( codes )
     @codes = codes
   end

data/lib/factbook-readers/comparisons.rb CHANGED

@@ -9,7 +9,7 @@ class Comparisons
                            :name,
                           )
-  def self.from_csv( path )
+  def self.read_csv( path )
     rows = CsvHash.read( path )
@@ -27,7 +27,7 @@ class Comparisons
       recs << rec
     end
-    self.new( recs )
+    new( recs )
   end
   def initialize( comps )

data/lib/factbook-readers/page.rb CHANGED

@@ -2,28 +2,6 @@
 module Factbook
-## note:
-##   some factbook pages with chrome (headers, footers, etc.)
-##     are NOT valid utf-8, thus,
-##     treat page as is (e.g. ASCII8BIT)
-#
-#   only convert to utf8 when header and footer got stripped
-##
-## be/benin:
-##   Key Force or FC [Lazare S?xx?HOU?xx?TO]     -- two invalid byte code chars in Political parties and leaders:
-#
-##   in Western/Windows-1252  leads to  FC [Lazare SÈHOUÉTO];
-#       Lazare Sèhouéto
-#
-#   looks good - use (assume) Windows-1252 ????
-##
-#   check for is ascii 7-bit ???  if yes -noworries
-#     if not, log number of chars not using ascii 7-bit
 class Page
   include LogUtils::Logging
@@ -35,52 +13,85 @@ class Page
   ## standard version  (note: requires https)
   SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
-  def initialize( code, opts={} )
-    ### keep code - why? why not??  (use page_info/info e.g. info.country_code??)
-    if opts[:json]
-      json = opts[:json]    ## note: json is (still) a string/text (NOT yet parsed to structured data)
-      b = JsonBuilder.from_string( json )
+  def self.parse( html )   ## parse html from string
+    new( html: html )
+  end
+  def self.read( path )
+    html = File.open( path, 'r:utf-8' ) { |f| f.read }
+    new( html: html )
+  end
+  def self.parse_json( json )  ## parse json from string
+    new( json: json )
+  end
+  def self.read_json( path )
+    json = File.open( path, 'r:utf-8' ) { |f| f.read }
+    new( json: json )
+  end
+  def self.download( code, cache: false )
+    new( code, cache: cache )
+  end
+  ## some convenience alias(es)
+  class << self
+    alias_method :read_html,  :read
+    alias_method :parse_html, :parse
+  end
+  def initialize( code=nil,
+                  json: nil,
+                  html: nil,
+                  cache: false,
+                  info: nil )
+    if json
+       ## note: assumes json is (still) a string/text
+       ##        (NOT yet parsed to structured data)
+      b = JsonBuilder.new( json )
     else  ## assume html
-      if opts[:html]    ## note: expects ASCII-7BIT/BINARY encoding
-         ## for debugging and testing allow "custom" passed-in html page
-        html = opts[:html]
+      if html
+        ## for debugging and testing allow "custom" passed-in html page
       else
-        url_string =  SITE_BASE.gsub( '{code}', code )
-        ## note: expects ASCII-7BIT/BINARY encoding
+        ## allow passing in code struct too - just use/pluck two-letter code from struct !!!
+        code = code.code   if code.is_a?( Codes::Code )
-        ## html = fetch_page( url_string )   ## use PageFetcher class - why?? why not??
-        html = Webcache.read( url_string )
+        raise ArgumentError, "two letter code (e.g. au) required to download page & build page url"   if code.nil?
+        url = SITE_BASE.sub( '{code}', code )
+        html = if cache && Webcache.exist?( url )
+                   Webcache.read( url )  ## for debugging - read from cache
+               else
+                   download_page( url )
+               end
       end
-      b = Builder.from_string( html )
+      b = Builder.new( html )
     end
     @sects = b.sects
     @info  = b.info
     ## todo/fix/quick hack:
-    ##  check for info opts hash entry - lets you overwrite page info
+    ##  check for info opts - lets you overwrite page info
     ##  -- use proper header to setup page info - why, why not??
-    if opts[:info]
-      info  = opts[:info]
-      @info = info
-    end
+    @info = info    if info
     @data = {}
     @sects.each do |sect|
       @data[ sect.title ] = sect.data
     end
-    self  ## return self (check - not needed??)
   end
-  def to_json( opts={} )  ## convenience helper for data.to_json; note: pretty print by default!
-    if opts[:minify]
+  def to_json( minify: false )  ## convenience helper for data.to_json; note: pretty print by default!
+    if minify
       data.to_json
-    else
-      ## was: -- opts[:pretty] || opts[:pp]
-      JSON.pretty_generate( data )   ## note: pretty print by default!
+    else ## note: pretty print by default!
+      JSON.pretty_generate( data )
     end
   end
@@ -96,30 +107,9 @@ class Page
     data[key]
   end
-  ## add convenience (shortcut) accessors / attributes / fields / getters
-  ATTRIBUTES.each do |attrib|
-    ## e.g.
-    ##    def background()  data['Introduction']['Background']['text']; end
-    ##    def location()    data['Geography']['Location']['text'];      end
-    ##    etc.
-    if attrib.path.size == 1
-      define_method attrib.name.to_sym do
-        @data.fetch( attrib.category, {} ).
-              fetch( attrib.path[0], {} )['text']
-      end
-    else  ## assume size 2 for now
-      define_method attrib.name.to_sym do
-        @data.fetch( attrib.category, {} ).
-              fetch( attrib.path[0], {} ).
-              fetch( attrib.path[1], {} )['text']
-      end
-    end
-  end
 private
-  def fetch_page( url )
+  def download_page( url )
     response = Webget.page( url )
     ## note: exit on get / fetch error - do NOT continue for now - why? why not?
@@ -128,21 +118,5 @@ private
     response.text
   end
-=begin
-def self.from_url( cc, cn )
-  html_ascii = PageFetcher.new.fetch( cc )
-  self.new( cc, cn, html_ascii )
-end
-def self.from_file( cc, cn, opts={} )
-  input_dir = opts[:input_dir] || '.'
-  html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
-  self.new( cc, cn, html_ascii )
-end
-=end
 end # class Page
 end # module Factbook