RubyGems - factbook - Versions diffs - 0.1.3 → 1.0.0 - Mend

factbook 0.1.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

checksums.yaml +7 -0
data/Manifest.txt +34 -22
data/README.md +8 -3
data/Rakefile +2 -263
data/data/codes.csv +262 -0
data/data/comparisons.csv +75 -0
data/lib/factbook/builder.rb +214 -0
data/lib/factbook/builder_item.rb +93 -0
data/lib/factbook/codes.rb +119 -0
data/lib/factbook/comparisons.rb +50 -0
data/lib/factbook/page.rb +103 -303
data/lib/factbook/sanitizer.rb +214 -0
data/lib/factbook/sect.rb +29 -196
data/lib/factbook/subsect.rb +18 -0
data/lib/factbook/table.rb +52 -0
data/lib/factbook/utils.rb +85 -0
data/lib/factbook/utils_info.rb +102 -0
data/lib/factbook/version.rb +4 -3
data/lib/factbook.rb +23 -1
data/test/data/au.html +579 -0
data/test/data/au.yml +8 -0
data/test/data/be.html +596 -0
data/test/data/be.yml +8 -0
data/test/data/src/au.html +2006 -0
data/test/data/src/be.html +2011 -0
data/test/helper.rb +0 -4
data/test/test_builder.rb +37 -0
data/test/test_codes.rb +76 -0
data/test/test_comparisons.rb +19 -0
data/test/test_fields.rb +21 -18
data/test/test_item_builder.rb +99 -0
data/test/test_json.rb +17 -20
data/test/test_page.rb +18 -10
data/test/test_sanitizer.rb +35 -0
metadata +68 -49
data/.gemtest +0 -0
data/test/data/countrytemplate_au.html +0 -4179
data/test/data/countrytemplate_be.html +0 -4260
data/test/data/countrytemplate_br.html +0 -4366
data/test/data/countrytemplate_ee.html +0 -2999
data/test/data/countrytemplate_ls.html +0 -2728
data/test/data/countrytemplate_mx.html +0 -4397
data/test/data/countrytemplate_vt.html +0 -1726
data/test/data/countrytemplate_xx.html +0 -2898
data/test/test_page_old.rb +0 -478
data/test/test_strip.rb +0 -66

data/lib/factbook/page.rb CHANGED Viewed

@@ -1,303 +1,103 @@
-# encoding: utf-8
-module Factbook
-  class Page
-    include LogUtils::Logging
-    ## standard version
-    ## SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
-    ## -- use text (low-bandwidth) version
-    ## e.g. www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_br.html
-    SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'
-    def initialize( code, opts={} )
-      ## note: requires factbook country code
-      #   e.g. austria is au
-      #        germany is gm  and so on
-      @code  = code
-      ### rename fields to format option?? why? why not? e.g. :format => 'long' ??
-      @opts  = opts   # fields:  full|long|keep|std|??  -- find a good name for the option keeping field names as is
-      @html  = nil
-      @doc   = nil
-      @sects = nil
-      @data  = nil
-    end
-    def doc
-      @doc ||= Nokogiri::HTML( html )
-    end
-    def to_json( opts={} )
-      ## convenience helper for data.to_json
-      if opts[:pretty] || opts[:pp]
-        JSON.pretty_generate( data )
-      else
-        data.to_json
-      end
-    end
-    def [](key)  ### convenience shortcut
-      # lets you use
-      #   page['geo']
-      #   instead of
-      #   page.data['geo']
-      ##  fix: use delegate data, [] from forwardable lib - why?? why not??
-      data[key]
-    end
-    def data
-      if @data.nil?
-        @data = {}
-        if @opts[:header]   ## include (leading) header section ??
-          header_key =     @opts[:fields] ? 'Header' : 'header'
-          last_built_key = @opts[:fields] ? 'last built' : 'last_built'
-          @data[header_key] = {
-            'code' => @code,
-            'generator' => "factbook/#{VERSION}",
-            last_built_key => "#{Time.now}",
-          }
-        end
-        sects.each_with_index do |sect,i|
-          logger.debug "############################"
-          logger.debug "###  [#{i}] stats sect >#{sect.title}<: "
-          @data[ sect.title ] = sect.data
-        end
-      end
-      @data
-    end
-    def sects
-      if @sects.nil?
-        ## split html into sections
-        ##   lets us avoids errors w/ (wrongly) nested tags
-        ## check opts for using long or short category/field names
-        divs = [
-          [ @opts[:fields] ? 'Introduction'        : 'intro',    '<div id="CollapsiblePanel1_Intro"'   ],
-          [ @opts[:fields] ? 'Geography'           : 'geo',      '<div id="CollapsiblePanel1_Geo"'     ],
-          [ @opts[:fields] ? 'People and Society'  : 'people',   '<div id="CollapsiblePanel1_People"'  ],
-          [ @opts[:fields] ? 'Government'          : 'govt',     '<div id="CollapsiblePanel1_Govt"'    ],
-          [ @opts[:fields] ? 'Economy'             : 'econ',     '<div id="CollapsiblePanel1_Econ"'    ],
-          [ @opts[:fields] ? 'Energy'              : 'energy',   '<div id="CollapsiblePanel1_Energy"'  ],
-          [ @opts[:fields] ? 'Communications'      : 'comm',     '<div id="CollapsiblePanel1_Comm"'    ],
-          [ @opts[:fields] ? 'Transportation'      : 'trans',    '<div id="CollapsiblePanel1_Trans"'   ],
-          [ @opts[:fields] ? 'Military'            : 'military', '<div id="CollapsiblePanel1_Military"'],
-          [ @opts[:fields] ? 'Transnational Issues': 'issues',   '<div id="CollapsiblePanel1_Issues"'  ]
-        ]
-        indexes = []
-        ## note:
-        ##   skip missing sections (w/ warning)
-        ##   e.g. Vatican (Holy See), Liechtenstein etc. have no Energy section, for example
-        divs.each_with_index do |rec,i|
-          title = rec[0]
-          div   = rec[1]
-          p = html.index( div )
-          if p.nil?
-            ## issue warning: if not found
-            logger.warn "***!!! section not found -- #{div} --; skipping"
-          else
-            logger.debug "  found section #{i} @ #{p}"
-            indexes <<  [title,p]
-          end
-        end
-        @sects = []
-        indexes.each_with_index do |rec,i|
-          title = rec[0]
-          from  = rec[1]
-          # is last entry? if yes use -1 otherewise pos
-          #   note: subtract one (-1) from pos unless end-of-string (-1)
-          to    = indexes[i+1].nil? ? -1 : indexes[i+1][1]-1
-          ## todo: check that from is smaller than to
-          logger.debug "   cut section #{i} [#{from}..#{to}]"
-          @sects << Sect.new( title, html[ from..to ], @opts )
-          ##if i==0 || i==1
-            ## puts "debug sect #{i}:"
-            ## puts ">>>|||#{html[ from..to ]}|||<<<"
-          ##end
-        end
-      end
-      @sects
-    end
-    def html=(html)
-      ## for debugging n testing
-      ## lets you set html (no need to fetch via net)
-      @html = html
-    end
-    def html
-      if @html.nil?
-        @html = fetch()
-      ### remove everything up to
-      ##   <div id="countryInfo" style="display: none;">
-      ## remove everything starting w/ footer
-      ## remove head !!!
-      ## in body remove header n footer
-        ## remove inline script
-        @html = @html.gsub( /<script[^>]*>.*?<\/script>/m ) do |m|
-          puts "remove script:"
-          puts "#{m}"
-          ''
-        end
-        ## remove inline style
-        @html = @html.gsub( /<style[^>]*>.*?<\/style>/m ) do |m|
-          puts "remove style:"
-          puts "#{m}"
-          ''
-        end
-        ## remove link
-        link_regex = /<link[^>]+>/
-        @html = @html.gsub( link_regex ) do |m|
-          puts "remove link:"
-          puts "#{m}"
-          ''
-        end
-        div_country_info_regex = /<div id="countryInfo"\s*>/
-        ## remove everything before <div id="countryInfo" >
-        pos = @html.index( div_country_info_regex )
-        if pos  # not nil, false
-          @html = @html[pos..-1]
-        end
-        ## remove country comparison
-        ## e.g.  <span class="category" >country comparison to the world:</span>
-        ##       <span class="category_data">
-        ##  <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown=""  title="Country comparison to the world" alt="Country comparison to the world">
-        ##    5
-        ##  </a>
-        ##  </span>
-        ##
-        ##
-        ## <span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data">
-        ##  <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown=""  title="Country comparison to the world" alt="Country comparison to the world"> 5 </a> </span>
-        ##
-        country_comparison_regex = /
-         <span \s class="category"[^>]*>
-           country \s comparison \s to \s the \s world:
-         <\/span>
-          \s*
-         <span \s class="category_data"[^>]*>
-          \s*
-            <a \s [^>]+>
-             .+?
-            <\/a>
-          \s*
-         <\/span>
-        /xm
-        @html = @html.gsub( country_comparison_regex ) do |m|
-          puts "remove country comparison:"
-          puts "#{m}"
-          ''
-        end
-        style_attr_regex = /\s*style="[^"]+"/
-        @html = @html.gsub( style_attr_regex ) do |m|
-          puts "remove style attr:"
-          puts "#{m}"
-          ''
-        end
-        ## <tr height="22">
-        ##   <td class="category_data"></td>
-        ##   </tr>
-        tr_empty_regex = /
-           <tr[^>]*>
-             \s*
-              <td[^>]*> \s* <\/td>
-             \s*
-           <\/tr>
-        /xm
-        @html = @html.gsub( tr_empty_regex ) do |m|
-          puts "remove tr emtpy:"
-          puts "#{m}"
-          ''
-        end
-        ##  remove world leader website promo
-        ##  <span class="category">(For more information visit the
-        ##     <a href="/library/publications/world-leaders-1/index.html" target="_blank">World Leaders website</a>&nbsp;
-        ##       <img src="../graphics/soa_newwindow.gif" alt="Opens in New Window" title="Opens in New Window" border="0"/>)
-        ##  </span>
-        world_leaders_website_regex = /
-         <span \s class="category"[^>]*>
-           \(
-           For \s more \s information \s
-            .+?       ## non-greedy (smallest possible match
-           \)
-         <\/span>
-        /xm
-        @html = @html.gsub( world_leaders_website_regex ) do |m|
-          puts "remove world leader website promo:"
-          puts "#{m}"
-          ''
-        end
-      end
-      @html
-    end
-  private
-    def fetch
-      uri_string = SITE_BASE.gsub( '{code}', @code )
-      worker = Fetcher::Worker.new
-      response = worker.get_response( uri_string )
-      if response.code == '200'
-        t = response.body
-        ###
-        # NB: Net::HTTP will NOT set encoding UTF-8 etc.
-        # will mostly be ASCII
-        # - try to change encoding to UTF-8 ourselves
-        logger.debug "t.encoding.name (before): #{t.encoding.name}"
-        #####
-        # NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
-        ## NB:
-        # for now "hardcoded" to utf8 - what else can we do?
-        # - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
-        t = t.force_encoding( Encoding::UTF_8 )
-        logger.debug "t.encoding.name (after): #{t.encoding.name}"
-        ## pp t
-        t
-      else
-        logger.error "fetch HTTP - #{response.code} #{response.message}"
-        nil
-      end
-    end
-  end # class Page
-end # module Factbook
+# encoding: utf-8
+module Factbook
+## note:
+##   some factbook pages with chrome (headers, footers, etc.)
+##     are NOT valid utf-8, thus,
+##     treat the page as is (e.g. ASCII8BIT)
+#
+#   only convert to utf8 once the header and footer have been stripped
+##
+## be/benin:
+##   Key Force or FC [Lazare S?xx?HOU?xx?TO]     -- two invalid byte code chars in "Political parties and leaders":
+#
+##   decoding as Western/Windows-1252  leads to  FC [Lazare SÈHOUÉTO];
+#       i.e. Lazare Sèhouéto
+#
+#   looks good - use (assume) Windows-1252 ????
+##
+#   check if the page is ascii 7-bit only ???  if yes - no worries
+#     if not, log the number of chars not using ascii 7-bit
+class Page     ## a factbook page: a list of sections (sects) plus a hash view (data) built from them
+  include LogUtils::Logging
+  attr_accessor :sects      ## list of sections; data (below) calls title and data on every entry
+  def initialize     ## starts with an empty list of sects
+    @sects = []
+  end
+  def [](key)  ### convenience shortcut for data[key]
+    # lets you use
+    #   page['geo']
+    #   instead of
+    #   page.data['geo']
+    ##  fix: use delegate data, [] from forwardable lib - why?? why not??
+    data[key]
+  end
+  def data     ## returns a hash of sect.title => sect.data
+    ## note: the data hash gets built on first call and cached in @data for now; sects added or changed later are NOT picked up
+    if @data.nil?
+      ## convert sects to hash (keyed by sect title; a later sect with the same title overwrites an earlier one)
+      @data = {}
+      sects.each_with_index do |sect,i|     ## note: the index (i) is not used
+        @data[ sect.title ] = sect.data
+      end
+    end
+    @data
+  end
+=begin
+def self.from_url( cc, cn )
+  html_ascii = PageFetcher.new.fetch( cc )
+  self.new( cc, cn, html_ascii )
+end
+def self.from_file( cc, cn, opts={} )
+  input_dir = opts[:input_dir] || '.'
+  html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
+  self.new( cc, cn, html_ascii )
+end
+=end
+end # class Page
+=begin
+class PageFetcher
+def fetch( cc )
+  worker = Fetcher::Worker.new
+  factbook_base = 'https://www.cia.gov/library/publications/the-world-factbook/geos'
+  res = worker.get_response( "#{factbook_base}/#{cc}.html" )
+  # on error throw exception - why? why not??
+  if res.code != '200'
+    raise Fetcher::HttpError.new( res.code, res.message )
+  end
+  ###
+  # Note: Net::HTTP will NOT set encoding UTF-8 etc.
+  #   will be set to ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
+  html = res.body.to_s
+end
+end # PageFetcher
+=end
+end # module Factbook

data/lib/factbook/sanitizer.rb ADDED Viewed

@@ -0,0 +1,214 @@
+# encoding: utf-8
+module Factbook
+PageInfo = Struct.new( :country_code,     ## page (meta) info record; gets filled in by Sanitizer#sanitize
+                       :country_name,
+                       :country_affiliation,
+                       :region_code,
+                       :region_name,
+                       :last_updated )
+class Sanitizer     ## cuts the country profile out of a raw factbook page and cleans up the html
+  include LogUtils::Logging
+  include Utils     ## pulls in encode_utf8, ...
+def sanitize( html_ascii )
+  ## todo: add option for (html source) encoding - why?? why not??
+  ## note:
+  ##   returns 1) html profile without headers, footers, scripts, etc.
+  ##           2) page (meta) info e.g. country_name, country_code, last_updated, etc.
+  ##           3) errors e.g. list of encoding errors (invalid byte sequence etc.)
+  page_info = PageInfo.new
+  h = find_page_info( html_ascii )     ## note(review): find_page_info / find_page_last_updated are not defined here - presumably pulled in via Utils; verify
+  page_info.country_code        = h[:country_code]
+  page_info.country_name        = h[:country_name]
+  page_info.country_affiliation = h[:country_affiliation]
+  page_info.region_code         = h[:region_code]
+  page_info.region_name         = h[:region_name]
+  page_info.last_updated        = find_page_last_updated( html_ascii )
+  html_profile_ascii = find_country_profile( html_ascii )    ## cut-off headers, footers, scripts, etc.
+  ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
+  html, errors = encode_utf8( html_profile_ascii )  ## change encoding to utf-8  (from binary/ascii8bit)
+  html = sanitize_profile( html )
+  [html, page_info, errors]
+end
+BEGIN_FACTS_REGEX = /<ul\s+
+                       class="expandcollapse">
+                    /xim    ## x = extended (whitespace in pattern ignored); i = ignore case; m = multi-line (dot matches newline)
+END_FACTS_REGEX = /<\/li>\s*
+                   <\/ul>\s*
+                   <\/tbody>\s*
+                   <\/table>
+                  /xim      ## x = extended (whitespace in pattern ignored); i = ignore case; m = multi-line (dot matches newline)
+def find_country_profile( html )     ## returns the html from the begin facts marker up to the end facts marker; fails if a marker is missing
+  ####
+  ## remove header (everything before)
+  ##   <ul class="expandcollapse">
+  pos = html.index( BEGIN_FACTS_REGEX )
+  fail "*** no begin facts marker found for page"  if pos.nil?
+  puts "  bingo - found BEGIN_FACTS on pos #{pos}"
+  html = html[pos..-1]
+  pp html[0..100]     ## debug: print the first 101 chars of the profile
+  ###
+  ## remove footer
+  ##  assume everything after (last list item in unordered list inside a table body)
+  ##    </li>
+  ##    </ul>
+  ##    </tbody></table>
+  pos = html.index( END_FACTS_REGEX )
+  fail "*** no end facts marker found for page"  if pos.nil?
+  puts "  bingo - found END_FACTS on pos #{pos}"
+  html = html[0...pos] + "</li></ul>\n"        ## note: use ... (not ..) to cut-off at pos; then re-add the closing li and ul tags
+  pp html[-200..-1]     ## debug: print the last 200 chars (nil if the profile is shorter than 200 chars)
+  html
+end
+STYLE_ATTR_REGEX = /\s*
+                     style=('|").+?\1     ## note: use non-greedy match e.g. .+?
+                   /xim    ## note: the m flag is set, thus, .+? may span lines; do NOT allow multi-line - why? why not?
+CLASS_ATTR_REGEX =  /\s*
+                     class=('|")(.+?)\1     ## note: use non-greedy match e.g. .+?
+                   /xim    ## note: the m flag is set, thus, .+? may span lines; do NOT allow multi-line - why? why not?
+##
+## <div>
+##    <span class='category'>country comparison to the world:  </span>
+##    <span class='category_data'>[[191]]</span>
+## </div>
+##
+##  <span class='category'>country comparison to the world:  </span>
+##  <span class='category_data'><a href='../rankorder/2147rank.html#au'>114</a></span>
+## todo: add enclosing div too!!!  -- note(review): the regex below already requires <div>..</div> with no whitespace next to the div tags - confirm against the samples above
+COUNTRY_COMPARISON_REGEX = /
+        <div>
+         <span \s class='category'[^>]*>
+           country \s comparison \s to \s the \s world: \s*
+         <\/span>
+          \s*
+         <span \s class='category_data'[^>]*>
+          \s*
+            <a \s [^>]+>
+             .+?
+            <\/a>
+          \s*
+         <\/span>
+         <\/div>
+        /xim
+##
+##  <div class='wrap'>
+##     <div class='audio-player'>
+##    <audio id='audio-player-1' class='my-audio-player' src='../anthems/AU.mp3' type='audio/mp3' controls='controls'>
+##    </audio>
+##  </div></div>
+AUDIO_PLAYER_REGEX = /
+        <div \s class='wrap'>
+        <div \s class='audio-player'>
+          <audio \s [^>]+>
+          <\/audio>
+        <\/div>
+        <\/div>
+         /xim
+def sanitize_profile( html )     ## returns html with style attrs, audio players, country comparisons, anchors and list tags removed and class attrs cleaned up
+  html = html.gsub( STYLE_ATTR_REGEX ) do |m|
+          puts "remove style attr:"
+          puts "#{m}"
+          ''
+        end
+  html = html.gsub( AUDIO_PLAYER_REGEX ) do |m|
+          puts "remove audio player:"
+          puts "#{m}"
+          ''
+        end
+  html = html.gsub( COUNTRY_COMPARISON_REGEX ) do |m|
+          puts "remove country comparison:"
+          puts "#{m}"
+          ''
+        end
+  ## remove/cleanup anchors (a href) - keep the inner text only; drop the anchor completely if it wraps an image
+  html = html.gsub( /<a\s+href[^>]*>(.+?)<\/a>/im ) do |_|   ## note: use .+? non-greedy match
+    puts " replace anchor (a) href >#{$1}<"
+    inner_text = $1.dup ## keep a copy
+    if inner_text =~ /<img/    ## if includes image remove
+      puts "  remove image in anchor"
+      ''
+    else    ## keep inner text
+      inner_text
+    end
+  end
+  ## remove all list tags e.g. ul/li (the content in between is kept)
+  html = html.gsub( /<\/?(li|ul)[^>]*>/im ) do |m|
+    puts " remove list >#{m}<"
+    ''
+  end
+  ## clean-up class attrib e.g. remove unknown classes (keep region, category and category_data only)
+  html = html.gsub( CLASS_ATTR_REGEX ) do |m|
+          puts "cleanup class attr:"
+          puts "#{m}"
+          klasses = $2.split(' ')
+          klasses = klasses.select do |klass|
+            if ['region', 'category', 'category_data'].include?( klass )
+              true
+            else
+              puts "  remove class #{klass}"
+              false
+            end
+          end
+          if klasses.size > 0
+            " class='#{klasses.join(' ')}'"   ## note: add leading space!!
+          else
+            ''   ## remove class attrib completely
+          end
+        end
+   html
+end
+end # class Sanitizer
+end # module Factbook