RubyGems - factbook - Versions diffs - 1.2.1 → 1.2.2 - Mend

factbook 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ecdb08c833f823392dd8bacf5f0ca6186015ebb4
-  data.tar.gz: c6829c25c02f075ef65e129bc396b1d2b3aba8f0
+  metadata.gz: 5ef14a3d11b87b854592f81c50406e43f4399ac3
+  data.tar.gz: 8c4eb43f4dfdca5b20d4dd60699b51c52a8cc7ff
 SHA512:
-  metadata.gz: c40f0716ae27cc3c6d1a2a8ea9d2cdaf66b421e649396a9c141fa0b76539afed15fb694a567f7d8f705664bdf24751b8eb92b6b8bcda1c1a3869a5c2dbad75f4
-  data.tar.gz: 456b32abafb674a3e0e4205cf70d97b6fc449e4c286e29c5cb96d25da72a75184c87c0aa0fee58d483747fd16c33d8a8287d77b55a1cfce8bb18c84534021439
+  metadata.gz: 57066d2a00d4818330ca710947fa5fedfb80ad224b84d31d906eb7606ba8127b8c1f102405a61f6dd3742b3c6f0fa4ec10f290a8d1da3367747a85ca31279a14
+  data.tar.gz: 6aa9fa498d6a1f836300e3fe985a5abd2cf8770211d414427608d2515f3d4fa56108b798e17a7e8d2b0f1bc52f8347787fd431cc97a099877eeafd89fffb12af

data/lib/factbook/builder.rb CHANGED

@@ -36,7 +36,7 @@ attr_reader :html_ascii,     ## full "original" 1:1 page in "original/ascii8/bin
 def initialize( html_ascii )
   @html_ascii = html_ascii
   ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8  (from binary/ascii8bit)
   @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
@@ -53,7 +53,7 @@ def initialize( html_ascii )
     html_subsects  = html_sect[1]
     puts html_sect_head
     puts html_subsects.size
     ## get section title
     ##  @SECTION{Economy}  => Economy
     if html_sect_head =~ /@SECTION{(.+?)}/
@@ -74,7 +74,7 @@ def initialize( html_ascii )
           puts title
           subsect = Subsect.new
           subsect.title = title     ## todo/fix: cut off trailing colon (:)
           b = Factbook::ItemBuilder.new( html_subsect_body, title )
           h = b.read
           subsect.data = h
@@ -88,32 +88,23 @@ def initialize( html_ascii )
       @sects << sect
     else
       ## warn/fix:  no section title found
-    end
+    end
   end
   self  ## return self -- needed?? default (standard) anyway?? check and remove
 end
 def map_sects( html )
-   ## convert section titles
-   ##   from  <h2>..</h2>
-   ##   to "unified" marker
-  ## e.g.
-  ##  <h2 sectiontitle='Introduction' ccode='au'>Introduction ::  <span class='region'>AUSTRIA </span></h2>
-  ##  <h2>Introduction</h2>
+   ## convert section titles to "unified" marker
+   ## e.g.
+   ##   <h2>Introduction</h2>
-  title_regex= /<h2
-                 (?:\s[^>]+)?  ## allow optional attributes in h2
-                 >
+  title_regex= /<h2>
                  \s*
-                   ([^<>]+?)  ## note: use non-greedy; do NOT allow tags inside for now
+                   (.+?)  ## note: use non-greedy; do NOT allow tags inside for now
                  \s*
-                 (?:\s::\s
-                   .+?       ## note: use non-greedy; allows tags inside
-                 )?          ## strip optional name (e.g.  :: AUSTRIA)
                 <\/h2>
               /xim
@@ -121,33 +112,29 @@ def map_sects( html )
      puts "** found section >#{$1}<:"
      puts "   >|#{m}|<"
-     "\n\n@SECTION{#{$1}}\n\n"
+     "\n\n@SECTION{#{$1}}\n\n"
   end
   html
 end
 def map_subsects( html )
-   ## convert subsection titles
-   ##   from  <div id='field'>..</div>
-   ##   to "unified" marker
+   ## convert subsection titles to "unified" marker
+   ## e.g.
+   ##  <h3>Disputes - international:</h3>
-  ## e.g.
-  ##  <div id='field' class='category'>Disputes - international:</div>
-  title_regex= /<div \s id='field'
-                     \s class='category'>
-                   \s*
+  title_regex= /<h3>
+                  \s*
                    (.+?)                ## note: use non-greedy; allows tags inside - why? why not
-                   \s*
-                 <\/div>
+                  \s*
+                 <\/h3>
                /xim
   html = html.gsub( title_regex ) do |m|
      puts "** found subsection >#{$1}<:"
      puts "   >|#{m}|<"
-     "\n@SUBSECTION{#{$1}}\n"
+     "\n@SUBSECTION{#{$1}}\n"
   end
   html
 end
@@ -166,9 +153,9 @@ def split_sects( html )
   ##   String#split will include all catpure groups in the result array
   section_regex= /(@SECTION{.+?})/  ## note: use non-greedy -- check: need to escape {} ??
   chunks = html.split( section_regex )
   ## check if first item is a section or (html) prolog
   #   if prolog (remove)
   chunks.slice!(0)  unless chunks[0] =~ /@SECTION/  ## starts w/ @SECTION
@@ -195,9 +182,9 @@ def split_subsects( html )
   ## note: "wrap" regex in a capture group (just one)
   ##   String#split will include all catpure groups in the result array
   subsection_regex= /(@SUBSECTION{.+?})/  ## note: use non-greedy -- check: need to escape {} ??
   chunks = html.split( subsection_regex )
   ## check if first item is a section or (html) prolog

data/lib/factbook/sanitizer.rb CHANGED

@@ -209,6 +209,35 @@ REL_AFFILIATION_CATEGORY_REGEX = /
     /xim
+##########################################
+## transforms / simplify
+##
+## <h2 sectiontitle='Introduction' ccode='ag'>
+##   Introduction ::  <span class='region'>ALGERIA </span>
+## </h2>
+##   becomes =>
+## <h2>Introduction</h2>
+##
+##  todo/fix: use named capture in future e.g.
+##   (?<text>.+?)  instead of  (.+?)
+##   not working for now w/ gsub (just passed in match string NOT match data)
+CLEANUP_SECTION_REGEX = /
+     <h2 [^>]*>
+       (.+?)
+     <\/h2>
+    /xim
+##
+## <div id='field' class='category'>Electricity - consumption:</div>
+##   becomes =>
+## <h3>Electricity - consumption:</h3>
+CLEANUP_SUBSECTION_REGEX = /
+     <div \s id='field' [^>]*>
+       (.+?)
+     <\/div>
+    /xim
@@ -285,7 +314,7 @@ def sanitize_profile( html )
           klasses = $2.split(' ')
           klasses = klasses.select do |klass|
-            if ['region', 'category', 'category_data'].include?( klass )
+            if ['category', 'category_data'].include?( klass )
               true
             else
               puts "  remove class #{klass}"
@@ -300,6 +329,33 @@ def sanitize_profile( html )
           end
         end
+    ##################################################################
+    ## simplify/cleanup section and subsection headings
+    html = html.gsub( CLEANUP_SECTION_REGEX ) do |_|
+       puts " cleanup section (h2) heading >#{$1}<"
+       text = $1
+       pos = text.index( '::' )
+       if pos   ## if includes =>  :: <span> Region </span>  -- cut off
+         puts "    remove :: region/country from heading"
+         text = text[0...pos]
+       end
+       text = text.strip   # remove trailing space too
+       "<h2>#{text}</h2>"
+    end
+    html = html.gsub( CLEANUP_SUBSECTION_REGEX ) do |_|
+       puts " cleanup subsection (h3) heading >#{$1}<"
+       text = $1
+       text = text.strip   # remove trailing space too
+       "<h3>#{text}</h3>"
+    end
    html
 end

data/lib/factbook/version.rb CHANGED

@@ -4,7 +4,7 @@ module Factbook
   MAJOR = 1
   MINOR = 2
-  PATCH = 1
+  PATCH = 2
   VERSION = [MAJOR,MINOR,PATCH].join('.')
   def self.version

data/test/test_sanitizer.rb CHANGED

@@ -15,6 +15,7 @@ class TestSanitizer < MiniTest::Test
     ## austria (au)
     ## algeria (ag)
     ## belgium (be)
+    ## ['au'].each do |cnty|
     ['au','ag','be'].each do |cnty|
       ## use/fix: ASCII-8BIT (e.g.keep as is) -???

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: factbook
 version: !ruby/object:Gem::Version
-  version: 1.2.1
+  version: 1.2.2
 platform: ruby
 authors:
 - Gerald Bauer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-11-06 00:00:00.000000000 Z
+date: 2016-11-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: logutils