factbook-readers 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dc307d415f957d373118337b72baa5ca9c0b5686
4
- data.tar.gz: f5241909514a895878e29b1e9e3dd0f3fddf9088
3
+ metadata.gz: d9bc3eaf2cb6fa3774e0b7a25b53336be2b05a55
4
+ data.tar.gz: 86565dc92913645110beec803d7bc0a7c088155f
5
5
  SHA512:
6
- metadata.gz: cd89c3f31089bb3256969076a69fe2dcc3e04f762033cec14e621d5bbff289f72aecd4f6a67e0c32b864588c56f3a57e8059f9957f885293875a95695dd22059
7
- data.tar.gz: 5f55a31397cbfa186cb85597ec280a2fc94fe63e8b595ccf18e36e2168dff531e19d0a64a63281fba380356df23b56769318aecf1d84c52277e511e5cb21998f
6
+ metadata.gz: 755b8727d0bbcaecd97f52064b1b29321e1b59a72bff55bbdd995ed8968732def7480f4cab0f222bf4c9d163afbd5230647237b96d41dc136006f0f9a9473550
7
+ data.tar.gz: 638dcf4f4a552c705a743c9e7483e457303d2090a9204ee3d4b390c3256b537050ea4fdc0957cba461dcee549ecf52b541c4b691dcb8c500c4439eaf376d4a87
@@ -2,13 +2,11 @@ CHANGELOG.md
2
2
  Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
- data/attributes.yml
6
5
  data/categories.csv
7
6
  data/codes.csv
8
7
  data/codesxref.csv
9
8
  data/comparisons.csv
10
9
  lib/factbook-readers.rb
11
- lib/factbook-readers/attributes.rb
12
10
  lib/factbook-readers/builder.rb
13
11
  lib/factbook-readers/builder_item.rb
14
12
  lib/factbook-readers/builder_json.rb
@@ -27,26 +25,12 @@ lib/factbook-readers/utils.rb
27
25
  lib/factbook-readers/utils_info.rb
28
26
  lib/factbook-readers/version.rb
29
27
  lib/factbook/readers.rb
30
- test/data/au.html
31
- test/data/au.yml
32
- test/data/be.html
33
- test/data/be.yml
34
- test/data/json/au.json
35
- test/data/src/ag.html
36
- test/data/src/au-2015-09-24.html
37
- test/data/src/au.html
38
- test/data/src/be-2015-09-24.html
39
- test/data/src/be.html
40
28
  test/helper.rb
41
- test/test_attribs.rb
42
- test/test_attribs_def.rb
43
29
  test/test_builder.rb
44
30
  test/test_codes.rb
45
31
  test/test_comparisons.rb
46
- test/test_convert.rb
47
32
  test/test_counter.rb
48
33
  test/test_fields.rb
49
- test/test_importer.rb
50
34
  test/test_item_builder.rb
51
35
  test/test_json.rb
52
36
  test/test_json_builder.rb
data/README.md CHANGED
@@ -55,36 +55,35 @@ resulting in:
55
55
  ...
56
56
  ```
57
57
 
58
- ### Use shortcut attribute accessors
58
+ ### Use data attributes
59
59
 
60
60
  ```ruby
61
- pp page.background ## same as page['Introduction']['Background']['text']
61
+ pp page['Introduction']['Background']['text']
62
62
  # => "Following more than three centuries..."
63
- pp page.area ## same as page['Geography'][''Area']['total']['text']
63
+ pp page['Geography']['Area']['total']['text']
64
64
  # => "8,515,770 sq km"
65
- pp page.area_land ## same as page['Geography'][''Area']['land']['text']
65
+ pp page['Geography']['Area']['land']['text']
66
66
  # => "8,358,140 sq km"
67
- pp page.area_water ## same as page['Geography'][''Area']['water']['text']
67
+ pp page['Geography']['Area']['water']['text']
68
68
  # => "157,630 sq km"
69
- pp page.area_note ## same as page['Geography'][''Area']['note']['text']
69
+ pp page['Geography']['Area']['note']['text']
70
70
  # => "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."
71
- pp page.area_comparative ## same as page['Geography']['Area - comparative']['text']
71
+ pp page['Geography']['Area - comparative']['text']
72
72
  # => "slightly smaller than the US"
73
- pp page.climate ## same as page['Geography']['Climate']['text']
73
+ pp page['Geography']['Climate']['text']
74
74
  # => "mostly tropical, but temperate in south"
75
- pp page.terrain ## same as page['Geography']['Terrain']['text']
75
+ pp page['Geography']['Terrain']['text']
76
76
  # => "mostly flat to rolling lowlands in north; ..."
77
- pp page.elevation_lowest ## same as page['Geography']['Elevation extremes']['lowest point']['text']
77
+ pp page['Geography']['Elevation extremes']['lowest point']['text']
78
78
  # => "Atlantic Ocean 0 m"
79
- pp page.elevation_highest ## same as page['Geography']['Elevation extremes']['highest point']['text']
79
+ pp page['Geography']['Elevation extremes']['highest point']['text']
80
80
  # => "Pico da Neblina 2,994 m"
81
- pp page.resources ## same as page['Geography'][Natural resources']['text']
81
+ pp page['Geography']['Natural resources']['text']
82
82
  # => "bauxite, gold, iron ore, manganese, nickel, phosphates, ..."
83
83
  ...
84
84
  ```
85
85
 
86
- See [`data/attributes.yml`](data/attributes.yml) for the full listing of all built-in attribute shortcut accessors.
87
- See [Attributes](ATTRIBUTES.md) for a quick reference listing.
86
+ See [Attributes](../ATTRIBUTES.md) for a quick reference listing.
88
87
 
89
88
 
90
89
  ### Save to disk as JSON
@@ -216,7 +216,7 @@ sb,Saint Pierre and Miquelon,Dependencies (France),North America
216
216
  wf,Wallis and Futuna,Dependencies (France),Australia-Oceania
217
217
  aa,Aruba,Dependencies (Netherlands),Central America and Caribbean
218
218
  uc,Curacao,Dependencies (Netherlands),Central America and Caribbean
219
- sk,Sint Maarten,Dependencies (Netherlands),Central America and Caribbean
219
+ nn,Sint Maarten,Dependencies (Netherlands),Central America and Caribbean
220
220
  cw,Cook Islands,Dependencies (New Zealand),Australia-Oceania
221
221
  ne,Niue,Dependencies (New Zealand),Australia-Oceania
222
222
  tl,Tokelau,Dependencies (New Zealand),Australia-Oceania
@@ -17,22 +17,21 @@ require 'factbook-readers/version' # let it always go first
17
17
 
18
18
  require 'factbook-readers/codes'
19
19
  require 'factbook-readers/comparisons'
20
- require 'factbook-readers/attributes'
21
20
 
22
- module Factbook
23
-
24
- ## auto-load builtin codes, comparisons, attributes, etc.
25
- CODES = Codes.from_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" )
26
- COMPARISONS = Comparisons.from_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
27
- ATTRIBUTES = Attributes.from_yaml( "#{Factbook::Module::Readers.root}/data/attributes.yml" )
28
-
29
- def self.codes() CODES; end
30
- def self.comparisons() COMPARISONS; end
31
- def self.attributes() ATTRIBUTES; end
32
21
 
22
+ ## note: make codes, comparisons available
23
+ module Factbook
24
+ ## note: load on demand only builtin codes, comparisons, etc.
25
+ ## for now
26
+ def self.codes
27
+ @@codes ||= Codes.read_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" );
28
+ end
29
+ def self.comparisons
30
+ @@comparisons ||= Comparisons.read_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
31
+ end
33
32
  end # module Factbook
34
33
 
35
- ## note: make codes, comparisons, attributes available
34
+
36
35
 
37
36
  require 'factbook-readers/utils'
38
37
  require 'factbook-readers/utils_info'
@@ -1,49 +1,29 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
5
- class Builder ## todo: change to PageBuilder ???
4
+ class Builder ## todo: change to HtmlBuilder or PageBuilder ???
6
5
  include LogUtils::Logging
7
6
 
8
7
 
9
- =begin
10
- def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
11
- ## check/todo: rename input_dir to just dir or to include ?
12
- ## (there's no output_dir)?? - why? why not?
13
- input_dir = opts[:input_dir] || '.'
14
- self.from_file( "#{input_dir}/#{cc}.html" )
15
- end
16
- =end
17
-
18
-
19
- def self.from_file( path )
20
- html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
21
- self.from_string( html_ascii )
22
- end
23
-
24
- def self.from_string( html_ascii ) ## note: expects ASCII-7BIT/BINARY encoding
25
- self.new( html_ascii )
26
- end
27
-
28
8
 
29
- attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
30
- :html, ## utf-8 encoded profile
31
- :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
9
+ attr_reader :html_original, ## full "original" 1:1 page
10
+ :html, ## cut-out and sanitized profile
11
+ :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
32
12
  :info, ## page info incl. country_name, region_name, last_updated etc.
33
13
  :errors, ## encoding errors etc.
34
14
  :sects
35
15
 
36
16
 
37
- def initialize( html_ascii )
38
- @html_ascii = html_ascii
17
+ def initialize( html_original )
18
+ @html_original = html_original
39
19
 
40
- ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
- @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
20
+ @html, @info, @errors = Sanitizer.new.sanitize( @html_original )
42
21
 
43
22
 
44
23
  html_sects = if @html.empty?
45
24
  ## note: support "empty" pages - old format waiting for update!!!
46
25
  ## cannot parse for now
26
+ @html_debug = ''
47
27
  [] ## return empty (no) sections for now - sorry (it's just one page with code cc anyway!!)
48
28
  else
49
29
  @html_debug = map_sects( @html )
@@ -55,7 +35,7 @@ def initialize( html_ascii )
55
35
  pp html_sects
56
36
 
57
37
  ## debug
58
- ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
38
+ ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
59
39
 
60
40
 
61
41
  @sects = []
@@ -101,25 +81,22 @@ def initialize( html_ascii )
101
81
  ## warn/fix: no section title found
102
82
  end
103
83
  end
104
-
105
- self ## return self -- needed?? default (standard) anyway?? check and remove
106
84
  end
107
85
 
108
86
 
87
+ H2_RE = /<h2>
88
+ \s*
89
+ (.+?) ## note: use non-greedy; do NOT allow tags inside for now
90
+ \s*
91
+ <\/h2>
92
+ /xim
109
93
 
110
94
  def map_sects( html )
111
95
  ## convert section titles to "unified" marker
112
96
  ## e.g.
113
97
  ## <h2>Introduction</h2>
114
98
 
115
- title_regex= /<h2>
116
- \s*
117
- (.+?) ## note: use non-greedy; do NOT allow tags inside for now
118
- \s*
119
- <\/h2>
120
- /xim
121
-
122
- html = html.gsub( title_regex ) do |m|
99
+ html = html.gsub( H2_RE ) do |m|
123
100
  puts "** found section >#{$1}<:"
124
101
  puts " >|#{m}|<"
125
102
 
@@ -129,19 +106,19 @@ def map_sects( html )
129
106
  end
130
107
 
131
108
 
109
+ H3_RE = /<h3>
110
+ \s*
111
+ (.+?) ## note: use non-greedy; allows tags inside - why? why not
112
+ \s*
113
+ <\/h3>
114
+ /xim
115
+
132
116
  def map_subsects( html )
133
117
  ## convert subsection titles to "unified" marker
134
118
  ## e.g.
135
119
  ## <h3>Disputes - international:</h3>
136
120
 
137
- title_regex= /<h3>
138
- \s*
139
- (.+?) ## note: use non-greedy; allows tags inside - why? why not
140
- \s*
141
- <\/h3>
142
- /xim
143
-
144
- html = html.gsub( title_regex ) do |m|
121
+ html = html.gsub( H3_RE ) do |m|
145
122
  puts "** found subsection >#{$1}<:"
146
123
  puts " >|#{m}|<"
147
124
 
@@ -163,9 +140,8 @@ def split_sects( html )
163
140
  ## note: "wrap" regex in a capture group (just one)
164
141
  ## String#split will include all capture groups in the result array
165
142
 
166
- section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
167
-
168
- chunks = html.split( section_regex )
143
+ ## note: use non-greedy -- check: need to escape {} ??
144
+ chunks = html.split( /(@SECTION{.+?})/ )
169
145
 
170
146
  ## check if first item is a section or (html) prolog
171
147
  # if prolog (remove)
@@ -194,9 +170,8 @@ def split_subsects( html )
194
170
  ## note: "wrap" regex in a capture group (just one)
195
171
  ## String#split will include all capture groups in the result array
196
172
 
197
- subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
198
-
199
- chunks = html.split( subsection_regex )
173
+ ## note: use non-greedy -- check: need to escape {} ??
174
+ chunks = html.split( /(@SUBSECTION{.+?})/ )
200
175
 
201
176
  ## check if first item is a section or (html) prolog
202
177
  # if prolog (remove)
@@ -1,25 +1,14 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
5
4
  ######
6
5
  # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
7
6
 
8
- class JsonBuilder
7
+ class JsonBuilder
9
8
  include LogUtils::Logging
10
9
  include NormalizeHelper ## e.g. normalize_category
11
10
 
12
11
 
13
- def self.from_file( path )
14
- text = File.read( path ) ## fix: use File.read_utf8 from textutils
15
- self.from_string( text )
16
- end
17
-
18
- def self.from_string( text )
19
- self.new( text )
20
- end
21
-
22
-
23
12
  attr_reader :text,
24
13
  :json,
25
14
  :info, ## not used yet -- page info incl. country_name, region_name, last_updated etc.
@@ -29,7 +18,7 @@ attr_reader :text,
29
18
 
30
19
  def initialize( text )
31
20
  @text = text
32
-
21
+
33
22
  @json = JSON.parse( text )
34
23
 
35
24
  @info = nil ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
@@ -40,16 +29,16 @@ def initialize( text )
40
29
  @json.each do |k1,v1|
41
30
  sect_title = k1
42
31
  sect_subsects = v1
43
-
32
+
44
33
  sect = Sect.new
45
34
  sect.title = sect_title
46
-
35
+
47
36
  ## get subsections
48
37
  subsects = []
49
38
  sect_subsects.each do |k2,v2|
50
39
  subsect_title = k2
51
40
  subsect_data = v2
52
-
41
+
53
42
  subsect = Subsect.new
54
43
  subsect.title = subsect_title
55
44
 
@@ -61,13 +50,13 @@ def initialize( text )
61
50
  new_subsect_data[ normalize_category(k3) ] = v3
62
51
  end
63
52
  subsect_data = new_subsect_data
64
- end
65
-
53
+ end
54
+
66
55
  subsect.data = subsect_data
67
-
56
+
68
57
  subsects << subsect
69
58
  end
70
-
59
+
71
60
  sect.subsects = subsects
72
61
  @sects << sect
73
62
  end
@@ -16,7 +16,7 @@ class Codes
16
16
  :region, ## e.g. Europe, Oceans, etc.
17
17
  )
18
18
 
19
- def self.from_csv( path )
19
+ def self.read_csv( path )
20
20
  ###
21
21
  # note:
22
22
  # if you use quotes - NO leading spaces allowed e.g.
@@ -46,9 +46,10 @@ class Codes
46
46
  recs << rec
47
47
  end
48
48
 
49
- self.new( recs )
49
+ new( recs )
50
50
  end
51
51
 
52
+
52
53
  def initialize( codes )
53
54
  @codes = codes
54
55
  end
@@ -9,7 +9,7 @@ class Comparisons
9
9
  :name,
10
10
  )
11
11
 
12
- def self.from_csv( path )
12
+ def self.read_csv( path )
13
13
 
14
14
  rows = CsvHash.read( path )
15
15
 
@@ -27,7 +27,7 @@ class Comparisons
27
27
  recs << rec
28
28
  end
29
29
 
30
- self.new( recs )
30
+ new( recs )
31
31
  end
32
32
 
33
33
  def initialize( comps )
@@ -2,28 +2,6 @@
2
2
  module Factbook
3
3
 
4
4
 
5
- ## note:
6
- ## some factbook pages with chrome (headers, footers, etc.)
7
- ## are NOT valid utf-8, thus,
8
- ## treat page as is (e.g. ASCII8BIT)
9
- #
10
- # only convert to utf8 when header and footer got stripped
11
-
12
- ##
13
- ## be/benin:
14
- ## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
15
- #
16
- ## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
17
- # Lazare Sèhouéto
18
- #
19
- # looks good - use (assume) Windows-1252 ????
20
-
21
- ##
22
- # check for is ascii 7-bit ??? if yes -noworries
23
- # if not, log number of chars not using ascii 7-bit
24
-
25
-
26
-
27
5
  class Page
28
6
  include LogUtils::Logging
29
7
 
@@ -35,52 +13,85 @@ class Page
35
13
  ## standard version (note: requires https)
36
14
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
37
15
 
38
- def initialize( code, opts={} )
39
- ### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
40
16
 
41
- if opts[:json]
42
- json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
43
- b = JsonBuilder.from_string( json )
17
+ def self.parse( html ) ## parse html from string
18
+ new( html: html )
19
+ end
20
+
21
+ def self.read( path )
22
+ html = File.open( path, 'r:utf-8' ) { |f| f.read }
23
+ new( html: html )
24
+ end
25
+
26
+ def self.parse_json( json ) ## parse json from string
27
+ new( json: json )
28
+ end
29
+
30
+ def self.read_json( path )
31
+ json = File.open( path, 'r:utf-8' ) { |f| f.read }
32
+ new( json: json )
33
+ end
34
+
35
+ def self.download( code, cache: false )
36
+ new( code, cache: cache )
37
+ end
38
+
39
+ ## some convenience alias(es)
40
+ class << self
41
+ alias_method :read_html, :read
42
+ alias_method :parse_html, :parse
43
+ end
44
+
45
+
46
+ def initialize( code=nil,
47
+ json: nil,
48
+ html: nil,
49
+ cache: false,
50
+ info: nil )
51
+ if json
52
+ ## note: assumes json is (still) a string/text
53
+ ## (NOT yet parsed to structured data)
54
+ b = JsonBuilder.new( json )
44
55
  else ## assume html
45
- if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
46
- ## for debugging and testing allow "custom" passed-in html page
47
- html = opts[:html]
56
+ if html
57
+ ## for debugging and testing allow "custom" passed-in html page
48
58
  else
49
- url_string = SITE_BASE.gsub( '{code}', code )
50
- ## note: expects ASCII-7BIT/BINARY encoding
59
+ ## allow passing in code struct too - just use/pluck two-letter code from struct !!!
60
+ code = code.code if code.is_a?( Codes::Code )
51
61
 
52
- ## html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
53
- html = Webcache.read( url_string )
62
+ raise ArgumentError, "two letter code (e.g. au) required to download page & build page url" if code.nil?
63
+ url = SITE_BASE.sub( '{code}', code )
64
+
65
+ html = if cache && Webcache.exist?( url )
66
+ Webcache.read( url ) ## for debugging - read from cache
67
+ else
68
+ download_page( url )
69
+ end
54
70
  end
55
- b = Builder.from_string( html )
71
+ b = Builder.new( html )
56
72
  end
57
73
 
58
74
  @sects = b.sects
59
75
  @info = b.info
60
76
 
61
77
  ## todo/fix/quick hack:
62
- ## check for info opts hash entry - lets you overwrite page info
78
+ ## check for info opts - lets you overwrite page info
63
79
  ## -- use proper header to setup page info - why, why not??
64
- if opts[:info]
65
- info = opts[:info]
66
- @info = info
67
- end
80
+ @info = info if info
81
+
68
82
 
69
83
  @data = {}
70
84
  @sects.each do |sect|
71
85
  @data[ sect.title ] = sect.data
72
86
  end
73
-
74
- self ## return self (check - not needed??)
75
87
  end
76
88
 
77
89
 
78
- def to_json( opts={} ) ## convenience helper for data.to_json; note: pretty print by default!
79
- if opts[:minify]
90
+ def to_json( minify: false ) ## convenience helper for data.to_json; note: pretty print by default!
91
+ if minify
80
92
  data.to_json
81
- else
82
- ## was: -- opts[:pretty] || opts[:pp]
83
- JSON.pretty_generate( data ) ## note: pretty print by default!
93
+ else ## note: pretty print by default!
94
+ JSON.pretty_generate( data )
84
95
  end
85
96
  end
86
97
 
@@ -96,30 +107,9 @@ class Page
96
107
  data[key]
97
108
  end
98
109
 
99
- ## add convenience (shortcut) accessors / attributes / fields / getters
100
-
101
- ATTRIBUTES.each do |attrib|
102
- ## e.g.
103
- ## def background() data['Introduction']['Background']['text']; end
104
- ## def location() data['Geography']['Location']['text']; end
105
- ## etc.
106
- if attrib.path.size == 1
107
- define_method attrib.name.to_sym do
108
- @data.fetch( attrib.category, {} ).
109
- fetch( attrib.path[0], {} )['text']
110
- end
111
- else ## assume size 2 for now
112
- define_method attrib.name.to_sym do
113
- @data.fetch( attrib.category, {} ).
114
- fetch( attrib.path[0], {} ).
115
- fetch( attrib.path[1], {} )['text']
116
- end
117
- end
118
- end
119
-
120
110
 
121
111
  private
122
- def fetch_page( url )
112
+ def download_page( url )
123
113
  response = Webget.page( url )
124
114
 
125
115
  ## note: exit on get / fetch error - do NOT continue for now - why? why not?
@@ -128,21 +118,5 @@ private
128
118
 
129
119
  response.text
130
120
  end
131
-
132
-
133
- =begin
134
- def self.from_url( cc, cn )
135
- html_ascii = PageFetcher.new.fetch( cc )
136
- self.new( cc, cn, html_ascii )
137
- end
138
-
139
- def self.from_file( cc, cn, opts={} )
140
- input_dir = opts[:input_dir] || '.'
141
- html_ascii = File.read( "#{input_dir}/#{cc}.html" ) ## fix/todo: use ASCII8BIT/binary reader
142
- self.new( cc, cn, html_ascii )
143
- end
144
- =end
145
-
146
-
147
121
  end # class Page
148
122
  end # module Factbook