RubyGems - factbook - Versions diffs - 1.1.1 → 2.0.1 - Mend

factbook 1.1.1 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

checksums.yaml +4 -4
data/{HISTORY.md → CHANGELOG.md} +3 -3
data/Manifest.txt +1 -58
data/README.md +50 -575
data/Rakefile +29 -33
data/lib/factbook.rb +8 -75
metadata +20 -114
data/data/attributes.yml +0 -337
data/data/categories.csv +0 -164
data/data/codes.csv +0 -262
data/data/codesxref.csv +0 -280
data/data/comparisons.csv +0 -75
data/lib/factbook/almanac.rb +0 -72
data/lib/factbook/attributes.rb +0 -74
data/lib/factbook/builder.rb +0 -214
data/lib/factbook/builder_item.rb +0 -92
data/lib/factbook/builder_json.rb +0 -79
data/lib/factbook/codes.rb +0 -119
data/lib/factbook/comparisons.rb +0 -50
data/lib/factbook/counter.rb +0 -48
data/lib/factbook/db/importer.rb +0 -92
data/lib/factbook/db/models.rb +0 -11
data/lib/factbook/db/schema.rb +0 -36
data/lib/factbook/normalize.rb +0 -43
data/lib/factbook/page.rb +0 -185
data/lib/factbook/page_info.rb +0 -12
data/lib/factbook/reader_json.rb +0 -51
data/lib/factbook/sanitizer.rb +0 -207
data/lib/factbook/sect.rb +0 -29
data/lib/factbook/subsect.rb +0 -18
data/lib/factbook/table.rb +0 -52
data/lib/factbook/utils.rb +0 -85
data/lib/factbook/utils_info.rb +0 -102
data/lib/factbook/version.rb +0 -22
data/script/almanac.rb +0 -48
data/script/attributes.rb +0 -34
data/script/build.rb +0 -28
data/script/counter.rb +0 -145
data/script/json.rb +0 -18
data/script/testbr.rb +0 -33
data/script/testcodes.rb +0 -11
data/test/data/au.html +0 -579
data/test/data/au.yml +0 -8
data/test/data/be.html +0 -596
data/test/data/be.yml +0 -8
data/test/data/json/au.json +0 -892
data/test/data/src/au.html +0 -2006
data/test/data/src/be.html +0 -2011
data/test/helper.rb +0 -11
data/test/test_attribs.rb +0 -82
data/test/test_attribs_def.rb +0 -20
data/test/test_builder.rb +0 -35
data/test/test_codes.rb +0 -76
data/test/test_comparisons.rb +0 -19
data/test/test_convert.rb +0 -30
data/test/test_counter.rb +0 -31
data/test/test_fields.rb +0 -52
data/test/test_importer.rb +0 -55
data/test/test_item_builder.rb +0 -99
data/test/test_json.rb +0 -44
data/test/test_json_builder.rb +0 -25
data/test/test_normalize.rb +0 -23
data/test/test_page.rb +0 -38
data/test/test_sanitizer.rb +0 -35

data/lib/factbook/sect.rb DELETED

@@ -1,29 +0,0 @@
-# encoding: utf-8
-module Factbook
-class Sect
-  include LogUtils::Logging
-  attr_accessor :title        ## use name instead of title - why? why not?
-  attr_accessor :subsects
-  def initialize
-    @subsects = []
-  end
-  def data
-    ## convert sects to hash
-    @data = {}
-    subsects.each_with_index do |subsect,i|
-      @data[ subsect.title ] = subsect.data
-    end
-    @data
-  end
-end # class Sect
-end # module Factbook

data/lib/factbook/subsect.rb DELETED

@@ -1,18 +0,0 @@
-# encoding: utf-8
-module Factbook
-class Subsect
-  include LogUtils::Logging
-  attr_accessor :title        ## use name instead of title - why? why not?
-  attr_accessor :data         ## hash holding data e.g. { 'text' => '...' etc. }
-  def initialize
-    @data = {}
-  end
-end # class Subsect
-end # module Factbook

data/lib/factbook/table.rb DELETED

@@ -1,52 +0,0 @@
-# encoding: utf-8
-module Factbook
-##
-## make more "generic"  - why? why not?
-##   (re)use for other files ?? move to textutils ??
-##
-##  for now reads in rows with values separated by at least 3+ spaces e.g.:
-##   see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
-## 1      China                      1,367,485,388
-## 2      India                      1,251,695,584
-## 3      European Union             513,949,445
-## 4      United States              321,368,864
-## 5      Indonesia                  255,993,674
-## 6      Brazil                     204,259,812
-class TableReader
-  include LogUtils::Logging
-def initialize( text )
-  @text = text
-end
-def read
-  recs = []
-  line_no = 0
-  @text.each_line do |line|
-    line_no +=1
-    line = line.strip   ## remove leading and trailing whitespace
-    if line.empty?
-      puts "** skipping empty line #{line_no}"
-      next
-    end
-    values = line.split( /[ ]{3,}/ )    ## split three or more spaces - use just two ?? why? why not??
-    ## puts line
-    ## pp values
-    recs << values
-  end
-  recs
-end
-end # class TableReader
-end # module Factbook

data/lib/factbook/utils.rb DELETED

@@ -1,85 +0,0 @@
-# encoding: utf-8
-module Factbook
-  module Utils
-########################################
-## todo: move to textutils - why, why not ?????
-def encode_utf8( text )
-  errors = []   ## also return list of encoding errors
-  ## note: factbook claims utf-8  - but includes invalid bytes in some pages
-  ##   encoding is likley wester/windows-
-  ## note:
-  ##   use �    - unknown/invalid unicode char
-  ##  fix/todo: use ASCII-8BIT instead of binnary
-  text = text.encode('UTF-8', 'binary', :invalid => :replace,
-                                        :undef   => :replace,
-                                        :replace => '�' )
-  ## check for replaced/invalid chars and log warrning
-  pos = text.index( '�' )
-  while pos
-    from = pos-10   ## tood/fix: use min/max to check for bounds - why? why not??
-    to   = pos+10
-    around = text[from..to]
-    puts "  pos #{pos}, from #{from}, to #{to}, around >#{around}<"
-    msg  = "invalid char on pos #{pos} around: >#{around}<"
-    puts msg
-    ## also log message / w timestamp
-    errors << "#{Time.now} - #{msg}"
-    pos = text.index( '�', pos+1 )
-  end
-  [text,errors]   ## return text and errors (list)
-end
-def values_to_csv( values )
-  buf = ""
-  values.each_with_index do |value,i|
-     buf << ','  if i > 0    ## add comma (except for first value)
-     ## note: allow optional $ sign e.g. $100,000,000
-     ##  !!!! todo/fix: allow optional minus e.g. -44,000
-     if value =~ /^\$?[1-9][,0-9]+[0-9]$/    ### find a better regex - why? why not??
-       ## check if number e.g. 17,098,242  or $17,098,242
-       ##   remove commas  17098242
-       buf << value.gsub( ',', '' )
-     elsif value.index( ',').nil?
-       ## add as is 1:1 (no commana)
-       buf << value
-     else
-       ## escape comma with double quote
-       #   e.g. Guam, The becomes "Guam, The"
-       buf << '"'
-       buf << value
-       buf << '"'
-     end
-  end
-  buf
-end
-def data_to_csv( recs, headers )
-  text = ""
-  text << values_to_csv( headers )
-  text << "\n"
-  recs.each do |rec|
-    text << values_to_csv( rec )
-    text << "\n"
-  end
-  text
-end
-  end   # module Utils
-end     # module Factbook

data/lib/factbook/utils_info.rb DELETED

@@ -1,102 +0,0 @@
-# encoding: utf-8
-module Factbook
-  module Utils
-#######
-## find meta data (about page info)
-#### e.g. Page last updated on September 16, 2015
-MONTH_EN_TO_S={
-  'January'   => '1',
-  'February'  => '2',
-  'March'     => '3',
-  'April'     => '4',
-  'May'       => '5',
-  'June'      => '6',
-  'July'      => '7',
-  'August'    => '8',
-  'September' => '9',
-  'October'   => '10',
-  'November'  => '11',
-  'December'  => '12'
-}
-PAGE_LAST_UPDATED_REGEX = /
-                           Page \s last \s updated \s on \s
-                            (?<month_en>[a-z]+) \s
-                            (?<day>\d{1,2}), \s
-                            (?<year>\d{4})
-                          /imx
-def find_page_last_updated( html )
-  m = PAGE_LAST_UPDATED_REGEX.match( html )
-  if m
-    pp m
-    month_en = m[:month_en]
-    day      = m[:day]
-    year     = m[:year]
-    puts "** bingo - month #{month_en}, day #{day}, year #{year}"
-    month = MONTH_EN_TO_S[ month_en ]
-    date_str = "#{year}-#{month}-#{day}"
-    pp date_str
-    date = Date.strptime( date_str, '%Y-%m-%d' )
-    date
-  else
-    nil
-  end
-end
-##
-## e.g. regioncode="eur"
-##      countrycode="au"
-##      countryname="Austria"
-##      flagsubfield=""
-##      countryaffiliation=""
-##      flagdescription=""
-##      flagdescriptionnote=""
-##      region="Europe"
-##
-##   note: countryaffiliation may be empty
-PAGE_INFO_REGEX = /
-             regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
-               \s+
-             countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2>       ## is k<3> backref
-               \s+
-              countryname=(?<q3>"|')(?<country>.+?)\k<q3>
-               \s+
-                [^>]+?  ## allow any attribs (note: non-greedy)
-              countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4>     ## note: might be empty
-               \s+
-                [^>]+?  ## allow any attribs (note: non-greedy)
-              region=(?<q5>"|')(?<region>.+?)\k<q5>    ## check world - might be empty ?? or for ocean ??
-           /imx
-def find_page_info( html )
-  m = PAGE_INFO_REGEX.match( html )
-  if m
-    pp m
-    h = { country_code:        m[:country_code],
-          country_name:        m[:country],
-          country_affiliation: m[:affiliation],
-          region_code:         m[:region_code],
-          region_name:         m[:region] }
-    puts "** bingo - #{h.inspect}"
-    h    ## return hash w/ name-value pairs
-  else
-    nil   ## or return empty struct with nils/empty strings - why?? why not??
-  end
-end
-  end   # module Utils
-end     # module Factbook

data/lib/factbook/version.rb DELETED

@@ -1,22 +0,0 @@
-# encoding: utf-8
-module Factbook
-  MAJOR = 1
-  MINOR = 1
-  PATCH = 1
-  VERSION = [MAJOR,MINOR,PATCH].join('.')
-  def self.version
-    VERSION
-  end
-  def self.banner
-    "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
-  end
-  def self.root
-    "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
-  end
-end

data/script/almanac.rb DELETED

@@ -1,48 +0,0 @@
-# encoding: utf-8
-#
-#  use to run:
-#   ruby -I ./lib script/almanac.rb
-require 'factbook'
-TEMPLATE = <<EOS
-### <%= names %>
-<%= page.name_long=='none' ? '\-' : page.name_long %> › <%= page.map %> -- <%= page.location %> <br>
-<%= page.capital %> • <%= page.area %> • pop. <%= page.population %>
-**Languages:** <%= page.languages %>
-**Major cities:** <%= page.major_cities %>
-**Ethnic groups:** <%= page.ethnic_groups %>
-**Religions:** <%= page.religions %>
-**Independence:** <%= page.independence %>
-**Internet:** `<%= page.internet %>` • <%= page.internet_users %> • <%= page.internet_users_rate %>
-**Telephones - mobile:** <%= page.telephones_mobile %> • <%= page.telephones_mobile_subscriptions %> subs./100
-EOS
-#########################
-### read all countries
-###   using local json (dump) files
-##  see github.com/factbook/factbook.json   (use git clone)
-json_dir = '../../factbook/factbook.json'
-codes    = Factbook.codes.countries
-## todo: add tawain and ?? from others - why, why not??
-pages   = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )
-almanac = Factbook::Almanac.new( pages )
-## save to disk
-File.open( './tmp/ALMANAC.md', 'w' ) do |f|
-  f.write almanac.render( TEMPLATE )
-end
-puts "Done."

data/script/attributes.rb DELETED

@@ -1,34 +0,0 @@
-# encoding: utf-8
-#
-#  use to run:
-#   ruby -I ./lib script/attributes.rb
-#  e.g. prints attribute accessor shortcuts
-#
-#  ### Geography
-#
-#  - `location`  =>  Location
-#  - `coords`  =>  Geographic coordinates
-#  - `map`  =>  Map references
-#  ...
-require 'factbook'
-attribs= Factbook.attributes.to_a
-h = attribs.group_by { |a| a.category }
-pp h
-h.each do |k,v|
-  puts ""
-  puts "### #{k}"
-  puts ""
-  v.each do |a|
-    puts "- `#{a.name}`  =>  #{a.path.join(' › ')}"
-  end
-end

data/script/build.rb DELETED

@@ -1,28 +0,0 @@
-# encoding: utf-8
-#
-#  use to run/test:
-#   ruby -I ./lib script/build.rb
-require 'factbook'
-DB_CONFIG = {
-  adapter:  'sqlite3',
-  database: './factbook.db'
-}
-ActiveRecord::Base.logger = Logger.new( STDOUT )
-ActiveRecord::Base.establish_connection( DB_CONFIG )
-Factbook::CreateDb.new.up    ## create tables
-importer = Factbook::Importer.new
-Factbook.codes.each do |code|
-  puts "Fetching #{code.code}- #{code.name}..."
-  page = Factbook::Page.new( code.code )
-  puts "Adding #{code.code}- #{code.name}..."
-  importer.import( page )
-end
-puts "Done."

data/script/counter.rb DELETED

@@ -1,145 +0,0 @@
-# encoding: utf-8
-#
-#  use to run:
-#   ruby -I ./lib script/counter.rb
-require 'factbook'
-c = Factbook::Counter.new
-##  see github.com/factbook/factbook.json   (use git clone)
-json_dir = '../../factbook/factbook.json'
-codes    = Factbook.codes
-pages   = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )
-pages.each do |page|
-  c.count( page )
-end
-h = c.data
-pp h
-### save to json
-puts "saving a copy to categories.json for debugging"
-File.open( "tmp/categories.json", 'w' ) do |f|
-  f.write JSON.pretty_generate( h )
-end
-SKIP_CATEGORIES_LINES=<<EOS
-######
-### france plus 5 overseas regions/departments
-##  metropolitan France
-## metropolitan France - total
-overseas departments
-French Guiana
-French Guiana - total
-Guadeloupe
-Guadeloupe and Martinique
-Martinique
-Mayotte
-Reunion
-###############
-### more
-Iles Eparses
-Ile Amsterdam
-Ile Amsterdam (Ile Amsterdam et Ile Saint-Paul)
-Ile Amsterdam et Ile Saint-Paul
-Ile Saint Paul
-Ile Saint-Paul (Ile Amsterdam et Ile Saint-Paul)
-Iles Crozet
-Iles Kerguelen
-Adelie Land
-Bassas da India
-Bassas da India (Iles Eparses)
-Bassas da India, Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
-Europa Island
-Europa Island (Iles Eparses)
-Europa Island, Glorioso Islands, Juan de Nova Island
-Europa Island and Juan de Nova Island (Iles Eparses)
-Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
-Glorioso Islands
-Glorioso Islands (Iles Eparses)
-Glorioso Island (Iles Eparses)
-Juan de Nova Island
-Juan de Nova Island (Iles Eparses)
-Tromelin Island
-Tromelin Island (Iles Eparses)
-Saint Helena
-Ascension Island
-Ascension
-Tristan da Cunha
-Tristan da Cunha island group
-Baker Island
-Baker, Howland, and Jarvis Islands
-Baker, Howland, and Jarvis Islands, and Johnston Atoll
-Baker, Howland, and Jarvis Islands, and Kingman Reef
-Howland Island
-Jarvis Island
-Johnston Atoll
-Johnston Atoll and Kingman Reef
-Kingman Reef
-Midway Islands
-Midway Islands, Johnston, and Palmyra Atolls
-Midway Islands and Palmyra Atoll
-Palmyra Atoll
-note on Palmyra Atoll
-EOS
-##   allow empty lines and skip comments
-SKIP_CATEGORIES = SKIP_CATEGORIES_LINES.split("\n").select { |item| !(item =~ /^\s*$/ || item =~ /^\s*#/) }
-def print_categories( data )
-  data.each do |k,v|
-    puts ""
-    puts "## #{k} _(#{v[:count]})_"
-    puts ""
-    walk_categories( v, 1 )
-  end
-end
-def walk_categories( data, level )
-  data.each do |k,v|
-    next if k == :count || k == :codes   ## skip "virtual" count entry (added for stats)
-    ## skip (sub)country entries e.g. Baker Island, Ile Amsterdam, etc.
-    next if  SKIP_CATEGORIES.include?( k )
-    print "     " * (level-1)   if level > 1    ## add 4 spaces indents per extra level
-    print "- "
-    print "**"                  if level == 1    ## mark as bold
-    print k
-    print "**"                  if level == 1
-    print " _("
-    print v[:count]
-    if v[:codes]     ##  add codes if present
-      print " - "
-      print v[:codes]
-    end
-    print ")_"
-    print "\n"
-    walk_categories( v, level+1)
-  end
-end
-print_categories( c.data )
-puts "Done."