RubyGems - factbook-readers - Versions diffs - 0.0.1 → 1.0.0 - Mend

factbook-readers 0.0.1 → 1.0.0

Files changed (42) hide show

checksums.yaml +4 -4
data/Manifest.txt +0 -16
data/README.md +13 -14
data/data/codes.csv +1 -1
data/lib/factbook-readers.rb +11 -12
data/lib/factbook-readers/builder.rb +28 -53
data/lib/factbook-readers/builder_json.rb +9 -20
data/lib/factbook-readers/codes.rb +3 -2
data/lib/factbook-readers/comparisons.rb +2 -2
data/lib/factbook-readers/page.rb +59 -85
data/lib/factbook-readers/sanitizer.rb +13 -34
data/lib/factbook-readers/version.rb +2 -2
data/test/helper.rb +1 -5
data/test/test_builder.rb +1 -6
data/test/test_codes.rb +5 -9
data/test/test_comparisons.rb +2 -5
data/test/test_counter.rb +4 -6
data/test/test_fields.rb +0 -2
data/test/test_item_builder.rb +7 -9
data/test/test_json.rb +1 -3
data/test/test_json_builder.rb +1 -3
data/test/test_normalize.rb +0 -2
data/test/test_page.rb +2 -4
data/test/test_sanitizer.rb +2 -5
data/test/test_sanitizer_regex.rb +0 -2
metadata +2 -18
data/data/attributes.yml +0 -337
data/lib/factbook-readers/attributes.rb +0 -74
data/test/data/au.html +0 -579
data/test/data/au.yml +0 -8
data/test/data/be.html +0 -596
data/test/data/be.yml +0 -8
data/test/data/json/au.json +0 -892
data/test/data/src/ag.html +0 -716
data/test/data/src/au-2015-09-24.html +0 -2006
data/test/data/src/au.html +0 -658
data/test/data/src/be-2015-09-24.html +0 -2011
data/test/data/src/be.html +0 -648
data/test/test_attribs.rb +0 -87
data/test/test_attribs_def.rb +0 -20
data/test/test_convert.rb +0 -30
data/test/test_importer.rb +0 -56

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: dc307d415f957d373118337b72baa5ca9c0b5686
-  data.tar.gz: f5241909514a895878e29b1e9e3dd0f3fddf9088
+  metadata.gz: d9bc3eaf2cb6fa3774e0b7a25b53336be2b05a55
+  data.tar.gz: 86565dc92913645110beec803d7bc0a7c088155f
 SHA512:
-  metadata.gz: cd89c3f31089bb3256969076a69fe2dcc3e04f762033cec14e621d5bbff289f72aecd4f6a67e0c32b864588c56f3a57e8059f9957f885293875a95695dd22059
-  data.tar.gz: 5f55a31397cbfa186cb85597ec280a2fc94fe63e8b595ccf18e36e2168dff531e19d0a64a63281fba380356df23b56769318aecf1d84c52277e511e5cb21998f
+  metadata.gz: 755b8727d0bbcaecd97f52064b1b29321e1b59a72bff55bbdd995ed8968732def7480f4cab0f222bf4c9d163afbd5230647237b96d41dc136006f0f9a9473550
+  data.tar.gz: 638dcf4f4a552c705a743c9e7483e457303d2090a9204ee3d4b390c3256b537050ea4fdc0957cba461dcee549ecf52b541c4b691dcb8c500c4439eaf376d4a87

data/Manifest.txt CHANGED

@@ -2,13 +2,11 @@ CHANGELOG.md
 Manifest.txt
 README.md
 Rakefile
-data/attributes.yml
 data/categories.csv
 data/codes.csv
 data/codesxref.csv
 data/comparisons.csv
 lib/factbook-readers.rb
-lib/factbook-readers/attributes.rb
 lib/factbook-readers/builder.rb
 lib/factbook-readers/builder_item.rb
 lib/factbook-readers/builder_json.rb
@@ -27,26 +25,12 @@ lib/factbook-readers/utils.rb
 lib/factbook-readers/utils_info.rb
 lib/factbook-readers/version.rb
 lib/factbook/readers.rb
-test/data/au.html
-test/data/au.yml
-test/data/be.html
-test/data/be.yml
-test/data/json/au.json
-test/data/src/ag.html
-test/data/src/au-2015-09-24.html
-test/data/src/au.html
-test/data/src/be-2015-09-24.html
-test/data/src/be.html
 test/helper.rb
-test/test_attribs.rb
-test/test_attribs_def.rb
 test/test_builder.rb
 test/test_codes.rb
 test/test_comparisons.rb
-test/test_convert.rb
 test/test_counter.rb
 test/test_fields.rb
-test/test_importer.rb
 test/test_item_builder.rb
 test/test_json.rb
 test/test_json_builder.rb

data/README.md CHANGED

@@ -55,36 +55,35 @@ resulting in:
     ...
 ```
-### Use shortcut attribute accessors
+### Use data attributes
 ```ruby
-pp page.background        ## same as page['Introduction']['Background']['text']
+pp page['Introduction']['Background']['text']
 # => "Following more than three centuries..."
-pp page.area              ## same as page['Geography'][''Area']['total']['text']
+pp page['Geography']['Area']['total']['text']
 # => "8,515,770 sq km"
-pp page.area_land         ## same as page['Geography'][''Area']['land']['text']
+pp page['Geography']['Area']['land']['text']
 # => "8,358,140 sq km"
-pp page.area_water        ## same as page['Geography'][''Area']['water']['text']
+pp page['Geography']['Area']['water']['text']
 # => "157,630 sq km"
-pp page.area_note         ## same as page['Geography'][''Area']['note']['text']
+pp page['Geography']['Area']['note']['text']
 # => "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."
-pp page.area_comparative  ## same as page['Geography']['Area - comparative']['text']
+pp page['Geography']['Area - comparative']['text']
 # => "slightly smaller than the US"
-pp page.climate           ## same as page['Geography']['Climate']['text']
+pp page['Geography']['Climate']['text']
 # => "mostly tropical, but temperate in south"
-pp page.terrain           ## same as page['Geography']['Terrain']['text']
+pp page['Geography']['Terrain']['text']
 # => "mostly flat to rolling lowlands in north; ..."
-pp page.elevation_lowest  ## same as page['Geography']['Elevation extremes']['lowest point']['text']
+pp page['Geography']['Elevation extremes']['lowest point']['text']
 # => "Atlantic Ocean 0 m"
-pp page.elevation_highest ## same as page['Geography']['Elevation extremes']['highest point']['text']
+pp page['Geography']['Elevation extremes']['highest point']['text']
 # => "Pico da Neblina 2,994 m"
-pp page.resources         ## same as page['Geography'][Natural resources']['text']
+pp page['Geography']['Natural resources']['text']
 # => "bauxite, gold, iron ore, manganese, nickel, phosphates, ..."
 ...
 ```
-See [`data/attributes.yml`](data/attributes.yml) for the full listing of all built-in attribute shortcut accessors.
-See [Attributes](ATTRIBUTES.md) for a quick reference listing.
+See [Attributes](../ATTRIBUTES.md) for a quick reference listing.
 ### Save to disk as JSON

data/data/codes.csv CHANGED

@@ -216,7 +216,7 @@ sb,Saint Pierre and Miquelon,Dependencies (France),North America
 wf,Wallis and Futuna,Dependencies (France),Australia-Oceania
 aa,Aruba,Dependencies (Netherlands),Central America and Caribbean
 uc,Curacao,Dependencies (Netherlands),Central America and Caribbean
-sk,Sint Maarten,Dependencies (Netherlands),Central America and Caribbean
+nn,Sint Maarten,Dependencies (Netherlands),Central America and Caribbean
 cw,Cook Islands,Dependencies (New Zealand),Australia-Oceania
 ne,Niue,Dependencies (New Zealand),Australia-Oceania
 tl,Tokelau,Dependencies (New Zealand),Australia-Oceania

data/lib/factbook-readers.rb CHANGED

@@ -17,22 +17,21 @@ require 'factbook-readers/version' # let it always go first
 require 'factbook-readers/codes'
 require 'factbook-readers/comparisons'
-require 'factbook-readers/attributes'
-module Factbook
-  ##  auto-load builtin codes, comparisons, attributes, etc.
-  CODES       = Codes.from_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" )
-  COMPARISONS = Comparisons.from_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
-  ATTRIBUTES  = Attributes.from_yaml( "#{Factbook::Module::Readers.root}/data/attributes.yml" )
-  def self.codes()       CODES; end
-  def self.comparisons() COMPARISONS; end
-  def self.attributes()  ATTRIBUTES; end
+## note: make codes, comparisons available
+module Factbook
+  ##  note: load on demand only builtin codes, comparisons, etc.
+  ##          for now
+  def self.codes
+    @@codes       ||= Codes.read_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" );
+  end
+  def self.comparisons
+    @@comparisons ||= Comparisons.read_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
+  end
 end # module Factbook
-## note: make codes, comparisons, attributes available
 require 'factbook-readers/utils'
 require 'factbook-readers/utils_info'

data/lib/factbook-readers/builder.rb CHANGED

@@ -1,49 +1,29 @@
-# encoding: utf-8
 module Factbook
-class Builder     ## todo: change to PageBuilder ???
+class Builder     ## todo: change to HtmlBuilder or PageBuilder ???
   include LogUtils::Logging
-=begin
-def self.from_cc( cc, opts={} )  ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
-  ## check/todo: rename input_dir to just dir or to include ?
-  ##   (there's no output_dir)?? - why? why not?
-  input_dir = opts[:input_dir] || '.'
-  self.from_file( "#{input_dir}/#{cc}.html" )
-end
-=end
-def self.from_file( path )
-  html_ascii = File.read( path )    ## fix/todo: use ASCII8BIT/binary reader !!!!!
-  self.from_string( html_ascii )
-end
-def self.from_string( html_ascii )   ## note: expects ASCII-7BIT/BINARY encoding
-  self.new( html_ascii )
-end
-attr_reader :html_ascii,     ## full "original" 1:1 page in "original/ascii8/binary" encoding
-            :html,           ## utf-8 encoded profile
-            :html_debug,     ## html w/ mapping markers - rename to html_markers - why? why not?
+attr_reader :html_original,    ## full "original" 1:1 page
+            :html,             ## cut-out and sanitized profile
+            :html_debug,      ## html w/ mapping markers - rename to html_markers - why? why not?
             :info,            ## page info incl. country_name, region_name, last_updated etc.
             :errors,          ## encoding errors etc.
             :sects
-def initialize( html_ascii )
-  @html_ascii = html_ascii
+def initialize( html_original )
+  @html_original = html_original
-  ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8  (from binary/ascii8bit)
-  @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
+  @html, @info, @errors = Sanitizer.new.sanitize( @html_original )
   html_sects =  if @html.empty?
                    ## note: support "empty" pages - old format waiting for update!!!
                    ##    cannot parse for now
+                   @html_debug = ''
                    []  ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
                 else
                    @html_debug = map_sects( @html )
@@ -55,7 +35,7 @@ def initialize( html_ascii )
   pp html_sects
   ## debug
-  ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
+  ##   File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
   @sects = []
@@ -101,25 +81,22 @@ def initialize( html_ascii )
       ## warn/fix:  no section title found
     end
   end
-  self  ## return self -- needed?? default (standard) anyway?? check and remove
 end
+H2_RE = /<h2>
+          \s*
+         (.+?)  ## note: use non-greedy; do NOT allow tags inside for now
+          \s*
+         <\/h2>
+        /xim
 def map_sects( html )
    ## convert section titles to "unified" marker
    ## e.g.
    ##   <h2>Introduction</h2>
-  title_regex= /<h2>
-                 \s*
-                   (.+?)  ## note: use non-greedy; do NOT allow tags inside for now
-                 \s*
-                <\/h2>
-              /xim
-  html = html.gsub( title_regex ) do |m|
+  html = html.gsub( H2_RE ) do |m|
      puts "** found section >#{$1}<:"
      puts "   >|#{m}|<"
@@ -129,19 +106,19 @@ def map_sects( html )
 end
+H3_RE = /<h3>
+          \s*
+         (.+?)                ## note: use non-greedy; allows tags inside - why? why not
+          \s*
+        <\/h3>
+       /xim
 def map_subsects( html )
    ## convert subsection titles to "unified" marker
    ## e.g.
    ##  <h3>Disputes - international:</h3>
-  title_regex= /<h3>
-                  \s*
-                   (.+?)                ## note: use non-greedy; allows tags inside - why? why not
-                  \s*
-                 <\/h3>
-               /xim
-  html = html.gsub( title_regex ) do |m|
+  html = html.gsub( H3_RE ) do |m|
      puts "** found subsection >#{$1}<:"
      puts "   >|#{m}|<"
@@ -163,9 +140,8 @@ def split_sects( html )
   ## note: "wrap" regex in a capture group (just one)
   ##   String#split will include all capture groups in the result array
-  section_regex= /(@SECTION{.+?})/  ## note: use non-greedy -- check: need to escape {} ??
-  chunks = html.split( section_regex )
+  ## note: use non-greedy -- check: need to escape {} ??
+  chunks = html.split( /(@SECTION{.+?})/ )
   ## check if first item is a section or (html) prolog
   #   if prolog (remove)
@@ -194,9 +170,8 @@ def split_subsects( html )
   ## note: "wrap" regex in a capture group (just one)
   ##   String#split will include all capture groups in the result array
-  subsection_regex= /(@SUBSECTION{.+?})/  ## note: use non-greedy -- check: need to escape {} ??
-  chunks = html.split( subsection_regex )
+  ## note: use non-greedy -- check: need to escape {} ??
+  chunks = html.split( /(@SUBSECTION{.+?})/ )
   ## check if first item is a section or (html) prolog
   #   if prolog (remove)

data/lib/factbook-readers/builder_json.rb CHANGED

@@ -1,25 +1,14 @@
-# encoding: utf-8
 module Factbook
 ######
 # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
-class JsonBuilder
+class JsonBuilder
   include LogUtils::Logging
   include NormalizeHelper    ##  e.g. normalize_category
-def self.from_file( path )
-  text = File.read( path )     ## fix: use File.read_utf8  from textutils
-  self.from_string( text )
-end
-def self.from_string( text )
-  self.new( text )
-end
 attr_reader :text,
             :json,
             :info,            ## not used yet -- page info incl. country_name, region_name, last_updated etc.
@@ -29,7 +18,7 @@ attr_reader :text,
 def initialize( text )
   @text = text
   @json = JSON.parse( text )
   @info   = nil   ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
@@ -40,16 +29,16 @@ def initialize( text )
   @json.each do |k1,v1|
     sect_title    = k1
     sect_subsects = v1
     sect = Sect.new
     sect.title = sect_title
     ## get subsections
     subsects = []
     sect_subsects.each do |k2,v2|
       subsect_title = k2
       subsect_data  = v2
       subsect = Subsect.new
       subsect.title = subsect_title
@@ -61,13 +50,13 @@ def initialize( text )
           new_subsect_data[ normalize_category(k3) ] = v3
         end
         subsect_data = new_subsect_data
-      end
+      end
       subsect.data  = subsect_data
       subsects << subsect
     end
     sect.subsects = subsects
     @sects << sect
   end

data/lib/factbook-readers/codes.rb CHANGED

@@ -16,7 +16,7 @@ class Codes
                      :region,    ## e.g. Europe, Oceans, etc.
                     )
-  def self.from_csv( path )
+  def self.read_csv( path )
     ###
     #  note:
     #   if you use quotes - NO leading spaces allowed e.g.
@@ -46,9 +46,10 @@ class Codes
       recs << rec
     end
-    self.new( recs )
+    new( recs )
   end
   def initialize( codes )
     @codes = codes
   end

data/lib/factbook-readers/comparisons.rb CHANGED

@@ -9,7 +9,7 @@ class Comparisons
                            :name,
                           )
-  def self.from_csv( path )
+  def self.read_csv( path )
     rows = CsvHash.read( path )
@@ -27,7 +27,7 @@ class Comparisons
       recs << rec
     end
-    self.new( recs )
+    new( recs )
   end
   def initialize( comps )

data/lib/factbook-readers/page.rb CHANGED

@@ -2,28 +2,6 @@
 module Factbook
-## note:
-##   some factbook pages with chrome (headers, footers, etc.)
-##     are NOT valid utf-8, thus,
-##     treat page as is (e.g. ASCII8BIT)
-#
-#   only convert to utf8 when header and footer got stripped
-##
-## be/benin:
-##   Key Force or FC [Lazare S?xx?HOU?xx?TO]     -- two invalid byte code chars in Political parties and leaders:
-#
-##   in Western/Windows-1252  leads to  FC [Lazare SÈHOUÉTO];
-#       Lazare Sèhouéto
-#
-#   looks good - use (assume) Windows-1252 ????
-##
-#   check for is ascii 7-bit ???  if yes -noworries
-#     if not, log number of chars not using ascii 7-bit
 class Page
   include LogUtils::Logging
@@ -35,52 +13,85 @@ class Page
   ## standard version  (note: requires https)
   SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
-  def initialize( code, opts={} )
-    ### keep code - why? why not??  (use page_info/info e.g. info.country_code??)
-    if opts[:json]
-      json = opts[:json]    ## note: json is (still) a string/text (NOT yet parsed to structured data)
-      b = JsonBuilder.from_string( json )
+  def self.parse( html )   ## parse html from string
+    new( html: html )
+  end
+  def self.read( path )
+    html = File.open( path, 'r:utf-8' ) { |f| f.read }
+    new( html: html )
+  end
+  def self.parse_json( json )  ## parse json from string
+    new( json: json )
+  end
+  def self.read_json( path )
+    json = File.open( path, 'r:utf-8' ) { |f| f.read }
+    new( json: json )
+  end
+  def self.download( code, cache: false )
+    new( code, cache: cache )
+  end
+  ## some convenience alias(es)
+  class << self
+    alias_method :read_html,  :read
+    alias_method :parse_html, :parse
+  end
+  def initialize( code=nil,
+                  json: nil,
+                  html: nil,
+                  cache: false,
+                  info: nil )
+    if json
+       ## note: assumes json is (still) a string/text
+       ##        (NOT yet parsed to structured data)
+      b = JsonBuilder.new( json )
     else  ## assume html
-      if opts[:html]    ## note: expects ASCII-7BIT/BINARY encoding
-         ## for debugging and testing allow "custom" passed-in html page
-        html = opts[:html]
+      if html
+        ## for debugging and testing allow "custom" passed-in html page
       else
-        url_string =  SITE_BASE.gsub( '{code}', code )
-        ## note: expects ASCII-7BIT/BINARY encoding
+        ## allow passing in code struct too - just use/pluck two-letter code from struct !!!
+        code = code.code   if code.is_a?( Codes::Code )
-        ## html = fetch_page( url_string )   ## use PageFetcher class - why?? why not??
-        html = Webcache.read( url_string )
+        raise ArgumentError, "two letter code (e.g. au) required to download page & build page url"   if code.nil?
+        url = SITE_BASE.sub( '{code}', code )
+        html = if cache && Webcache.exist?( url )
+                   Webcache.read( url )  ## for debugging - read from cache
+               else
+                   download_page( url )
+               end
       end
-      b = Builder.from_string( html )
+      b = Builder.new( html )
     end
     @sects = b.sects
     @info  = b.info
     ## todo/fix/quick hack:
-    ##  check for info opts hash entry - lets you overwrite page info
+    ##  check for info opts - lets you overwrite page info
     ##  -- use proper header to setup page info - why, why not??
-    if opts[:info]
-      info  = opts[:info]
-      @info = info
-    end
+    @info = info    if info
     @data = {}
     @sects.each do |sect|
       @data[ sect.title ] = sect.data
     end
-    self  ## return self (check - not needed??)
   end
-  def to_json( opts={} )  ## convenience helper for data.to_json; note: pretty print by default!
-    if opts[:minify]
+  def to_json( minify: false )  ## convenience helper for data.to_json; note: pretty print by default!
+    if minify
       data.to_json
-    else
-      ## was: -- opts[:pretty] || opts[:pp]
-      JSON.pretty_generate( data )   ## note: pretty print by default!
+    else ## note: pretty print by default!
+      JSON.pretty_generate( data )
     end
   end
@@ -96,30 +107,9 @@ class Page
     data[key]
   end
-  ## add convenience (shortcut) accessors / attributes / fields / getters
-  ATTRIBUTES.each do |attrib|
-    ## e.g.
-    ##    def background()  data['Introduction']['Background']['text']; end
-    ##    def location()    data['Geography']['Location']['text'];      end
-    ##    etc.
-    if attrib.path.size == 1
-      define_method attrib.name.to_sym do
-        @data.fetch( attrib.category, {} ).
-              fetch( attrib.path[0], {} )['text']
-      end
-    else  ## assume size 2 for now
-      define_method attrib.name.to_sym do
-        @data.fetch( attrib.category, {} ).
-              fetch( attrib.path[0], {} ).
-              fetch( attrib.path[1], {} )['text']
-      end
-    end
-  end
 private
-  def fetch_page( url )
+  def download_page( url )
     response = Webget.page( url )
     ## note: exit on get / fetch error - do NOT continue for now - why? why not?
@@ -128,21 +118,5 @@ private
     response.text
   end
-=begin
-def self.from_url( cc, cn )
-  html_ascii = PageFetcher.new.fetch( cc )
-  self.new( cc, cn, html_ascii )
-end
-def self.from_file( cc, cn, opts={} )
-  input_dir = opts[:input_dir] || '.'
-  html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
-  self.new( cc, cn, html_ascii )
-end
-=end
 end # class Page
 end # module Factbook