factbook-readers 0.0.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dc307d415f957d373118337b72baa5ca9c0b5686
4
- data.tar.gz: f5241909514a895878e29b1e9e3dd0f3fddf9088
3
+ metadata.gz: d9bc3eaf2cb6fa3774e0b7a25b53336be2b05a55
4
+ data.tar.gz: 86565dc92913645110beec803d7bc0a7c088155f
5
5
  SHA512:
6
- metadata.gz: cd89c3f31089bb3256969076a69fe2dcc3e04f762033cec14e621d5bbff289f72aecd4f6a67e0c32b864588c56f3a57e8059f9957f885293875a95695dd22059
7
- data.tar.gz: 5f55a31397cbfa186cb85597ec280a2fc94fe63e8b595ccf18e36e2168dff531e19d0a64a63281fba380356df23b56769318aecf1d84c52277e511e5cb21998f
6
+ metadata.gz: 755b8727d0bbcaecd97f52064b1b29321e1b59a72bff55bbdd995ed8968732def7480f4cab0f222bf4c9d163afbd5230647237b96d41dc136006f0f9a9473550
7
+ data.tar.gz: 638dcf4f4a552c705a743c9e7483e457303d2090a9204ee3d4b390c3256b537050ea4fdc0957cba461dcee549ecf52b541c4b691dcb8c500c4439eaf376d4a87
@@ -2,13 +2,11 @@ CHANGELOG.md
2
2
  Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
- data/attributes.yml
6
5
  data/categories.csv
7
6
  data/codes.csv
8
7
  data/codesxref.csv
9
8
  data/comparisons.csv
10
9
  lib/factbook-readers.rb
11
- lib/factbook-readers/attributes.rb
12
10
  lib/factbook-readers/builder.rb
13
11
  lib/factbook-readers/builder_item.rb
14
12
  lib/factbook-readers/builder_json.rb
@@ -27,26 +25,12 @@ lib/factbook-readers/utils.rb
27
25
  lib/factbook-readers/utils_info.rb
28
26
  lib/factbook-readers/version.rb
29
27
  lib/factbook/readers.rb
30
- test/data/au.html
31
- test/data/au.yml
32
- test/data/be.html
33
- test/data/be.yml
34
- test/data/json/au.json
35
- test/data/src/ag.html
36
- test/data/src/au-2015-09-24.html
37
- test/data/src/au.html
38
- test/data/src/be-2015-09-24.html
39
- test/data/src/be.html
40
28
  test/helper.rb
41
- test/test_attribs.rb
42
- test/test_attribs_def.rb
43
29
  test/test_builder.rb
44
30
  test/test_codes.rb
45
31
  test/test_comparisons.rb
46
- test/test_convert.rb
47
32
  test/test_counter.rb
48
33
  test/test_fields.rb
49
- test/test_importer.rb
50
34
  test/test_item_builder.rb
51
35
  test/test_json.rb
52
36
  test/test_json_builder.rb
data/README.md CHANGED
@@ -55,36 +55,35 @@ resulting in:
55
55
  ...
56
56
  ```
57
57
 
58
- ### Use shortcut attribute accessors
58
+ ### Use data attributes
59
59
 
60
60
  ```ruby
61
- pp page.background ## same as page['Introduction']['Background']['text']
61
+ pp page['Introduction']['Background']['text']
62
62
  # => "Following more than three centuries..."
63
- pp page.area ## same as page['Geography'][''Area']['total']['text']
63
+ pp page['Geography']['Area']['total']['text']
64
64
  # => "8,515,770 sq km"
65
- pp page.area_land ## same as page['Geography'][''Area']['land']['text']
65
+ pp page['Geography']['Area']['land']['text']
66
66
  # => "8,358,140 sq km"
67
- pp page.area_water ## same as page['Geography'][''Area']['water']['text']
67
+ pp page['Geography']['Area']['water']['text']
68
68
  # => "157,630 sq km"
69
- pp page.area_note ## same as page['Geography'][''Area']['note']['text']
69
+ pp page['Geography']['Area']['note']['text']
70
70
  # => "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."
71
- pp page.area_comparative ## same as page['Geography']['Area - comparative']['text']
71
+ pp page['Geography']['Area - comparative']['text']
72
72
  # => "slightly smaller than the US"
73
- pp page.climate ## same as page['Geography']['Climate']['text']
73
+ pp page['Geography']['Climate']['text']
74
74
  # => "mostly tropical, but temperate in south"
75
- pp page.terrain ## same as page['Geography']['Terrain']['text']
75
+ pp page['Geography']['Terrain']['text']
76
76
  # => "mostly flat to rolling lowlands in north; ..."
77
- pp page.elevation_lowest ## same as page['Geography']['Elevation extremes']['lowest point']['text']
77
+ pp page['Geography']['Elevation extremes']['lowest point']['text']
78
78
  # => "Atlantic Ocean 0 m"
79
- pp page.elevation_highest ## same as page['Geography']['Elevation extremes']['highest point']['text']
79
+ pp page['Geography']['Elevation extremes']['highest point']['text']
80
80
  # => "Pico da Neblina 2,994 m"
81
- pp page.resources ## same as page['Geography'][Natural resources']['text']
81
+ pp page['Geography']['Natural resources']['text']
82
82
  # => "bauxite, gold, iron ore, manganese, nickel, phosphates, ..."
83
83
  ...
84
84
  ```
85
85
 
86
- See [`data/attributes.yml`](data/attributes.yml) for the full listing of all built-in attribute shortcut accessors.
87
- See [Attributes](ATTRIBUTES.md) for a quick reference listing.
86
+ See [Attributes](../ATTRIBUTES.md) for a quick reference listing.
88
87
 
89
88
 
90
89
  ### Save to disk as JSON
@@ -216,7 +216,7 @@ sb,Saint Pierre and Miquelon,Dependencies (France),North America
216
216
  wf,Wallis and Futuna,Dependencies (France),Australia-Oceania
217
217
  aa,Aruba,Dependencies (Netherlands),Central America and Caribbean
218
218
  uc,Curacao,Dependencies (Netherlands),Central America and Caribbean
219
- sk,Sint Maarten,Dependencies (Netherlands),Central America and Caribbean
219
+ nn,Sint Maarten,Dependencies (Netherlands),Central America and Caribbean
220
220
  cw,Cook Islands,Dependencies (New Zealand),Australia-Oceania
221
221
  ne,Niue,Dependencies (New Zealand),Australia-Oceania
222
222
  tl,Tokelau,Dependencies (New Zealand),Australia-Oceania
@@ -17,22 +17,21 @@ require 'factbook-readers/version' # let it always go first
17
17
 
18
18
  require 'factbook-readers/codes'
19
19
  require 'factbook-readers/comparisons'
20
- require 'factbook-readers/attributes'
21
20
 
22
- module Factbook
23
-
24
- ## auto-load builtin codes, comparisons, attributes, etc.
25
- CODES = Codes.from_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" )
26
- COMPARISONS = Comparisons.from_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
27
- ATTRIBUTES = Attributes.from_yaml( "#{Factbook::Module::Readers.root}/data/attributes.yml" )
28
-
29
- def self.codes() CODES; end
30
- def self.comparisons() COMPARISONS; end
31
- def self.attributes() ATTRIBUTES; end
32
21
 
22
+ ## note: make codes, comparisons available
23
+ module Factbook
24
+ ## note: load on demand only builtin codes, comparisons, etc.
25
+ ## for now
26
+ def self.codes
27
+ @@codes ||= Codes.read_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" );
28
+ end
29
+ def self.comparisons
30
+ @@comparisons ||= Comparisons.read_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
31
+ end
33
32
  end # module Factbook
34
33
 
35
- ## note: make codes, comparisons, attributes available
34
+
36
35
 
37
36
  require 'factbook-readers/utils'
38
37
  require 'factbook-readers/utils_info'
@@ -1,49 +1,29 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
5
- class Builder ## todo: change to PageBuilder ???
4
+ class Builder ## todo: change to HtmlBuilder or PageBuilder ???
6
5
  include LogUtils::Logging
7
6
 
8
7
 
9
- =begin
10
- def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
11
- ## check/todo: rename input_dir to just dir or to include ?
12
- ## (there's no output_dir)?? - why? why not?
13
- input_dir = opts[:input_dir] || '.'
14
- self.from_file( "#{input_dir}/#{cc}.html" )
15
- end
16
- =end
17
-
18
-
19
- def self.from_file( path )
20
- html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
21
- self.from_string( html_ascii )
22
- end
23
-
24
- def self.from_string( html_ascii ) ## note: expects ASCII-7BIT/BINARY encoding
25
- self.new( html_ascii )
26
- end
27
-
28
8
 
29
- attr_reader :html_ascii, ## full "original" 1:1 page in "original/ascii8/binary" encoding
30
- :html, ## utf-8 encoded profile
31
- :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
9
+ attr_reader :html_original, ## full "original" 1:1 page
10
+ :html, ## cut-out and sanitized profile
11
+ :html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
32
12
  :info, ## page info incl. country_name, region_name, last_updated etc.
33
13
  :errors, ## encoding errors etc.
34
14
  :sects
35
15
 
36
16
 
37
- def initialize( html_ascii )
38
- @html_ascii = html_ascii
17
+ def initialize( html_original )
18
+ @html_original = html_original
39
19
 
40
- ## todo/fix: use/assume windows 12xx?? encoding - change encoding to utf-8 (from binary/ascii8bit)
41
- @html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
20
+ @html, @info, @errors = Sanitizer.new.sanitize( @html_original )
42
21
 
43
22
 
44
23
  html_sects = if @html.empty?
45
24
  ## note: support "empty" pages - old format waiting for update!!!
46
25
  ## cannot parse for now
26
+ @html_debug = ''
47
27
  [] ## return empty (no) sections for now - sorry (it's just one page with code cc anyway!!)
48
28
  else
49
29
  @html_debug = map_sects( @html )
@@ -55,7 +35,7 @@ def initialize( html_ascii )
55
35
  pp html_sects
56
36
 
57
37
  ## debug
58
- ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
38
+ ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
59
39
 
60
40
 
61
41
  @sects = []
@@ -101,25 +81,22 @@ def initialize( html_ascii )
101
81
  ## warn/fix: no section title found
102
82
  end
103
83
  end
104
-
105
- self ## return self -- needed?? default (standard) anyway?? check and remove
106
84
  end
107
85
 
108
86
 
87
+ H2_RE = /<h2>
88
+ \s*
89
+ (.+?) ## note: use non-greedy; do NOT allow tags inside for now
90
+ \s*
91
+ <\/h2>
92
+ /xim
109
93
 
110
94
  def map_sects( html )
111
95
  ## convert section titles to "unified" marker
112
96
  ## e.g.
113
97
  ## <h2>Introduction</h2>
114
98
 
115
- title_regex= /<h2>
116
- \s*
117
- (.+?) ## note: use non-greedy; do NOT allow tags inside for now
118
- \s*
119
- <\/h2>
120
- /xim
121
-
122
- html = html.gsub( title_regex ) do |m|
99
+ html = html.gsub( H2_RE ) do |m|
123
100
  puts "** found section >#{$1}<:"
124
101
  puts " >|#{m}|<"
125
102
 
@@ -129,19 +106,19 @@ def map_sects( html )
129
106
  end
130
107
 
131
108
 
109
+ H3_RE = /<h3>
110
+ \s*
111
+ (.+?) ## note: use non-greedy; allows tags inside - why? why not
112
+ \s*
113
+ <\/h3>
114
+ /xim
115
+
132
116
  def map_subsects( html )
133
117
  ## convert subsection titles to "unified" marker
134
118
  ## e.g.
135
119
  ## <h3>Disputes - international:</h3>
136
120
 
137
- title_regex= /<h3>
138
- \s*
139
- (.+?) ## note: use non-greedy; allows tags inside - why? why not
140
- \s*
141
- <\/h3>
142
- /xim
143
-
144
- html = html.gsub( title_regex ) do |m|
121
+ html = html.gsub( H3_RE ) do |m|
145
122
  puts "** found subsection >#{$1}<:"
146
123
  puts " >|#{m}|<"
147
124
 
@@ -163,9 +140,8 @@ def split_sects( html )
163
140
  ## note: "wrap" regex in a capture group (just one)
164
141
  ## String#split will include all capture groups in the result array
165
142
 
166
- section_regex= /(@SECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
167
-
168
- chunks = html.split( section_regex )
143
+ ## note: use non-greedy -- check: need to escape {} ??
144
+ chunks = html.split( /(@SECTION{.+?})/ )
169
145
 
170
146
  ## check if first item is a section or (html) prolog
171
147
  # if prolog (remove)
@@ -194,9 +170,8 @@ def split_subsects( html )
194
170
  ## note: "wrap" regex in a capture group (just one)
195
171
  ## String#split will include all capture groups in the result array
196
172
 
197
- subsection_regex= /(@SUBSECTION{.+?})/ ## note: use non-greedy -- check: need to escape {} ??
198
-
199
- chunks = html.split( subsection_regex )
173
+ ## note: use non-greedy -- check: need to escape {} ??
174
+ chunks = html.split( /(@SUBSECTION{.+?})/ )
200
175
 
201
176
  ## check if first item is a section or (html) prolog
202
177
  # if prolog (remove)
@@ -1,25 +1,14 @@
1
- # encoding: utf-8
2
1
 
3
2
  module Factbook
4
3
 
5
4
  ######
6
5
  # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
7
6
 
8
- class JsonBuilder
7
+ class JsonBuilder
9
8
  include LogUtils::Logging
10
9
  include NormalizeHelper ## e.g. normalize_category
11
10
 
12
11
 
13
- def self.from_file( path )
14
- text = File.read( path ) ## fix: use File.read_utf8 from textutils
15
- self.from_string( text )
16
- end
17
-
18
- def self.from_string( text )
19
- self.new( text )
20
- end
21
-
22
-
23
12
  attr_reader :text,
24
13
  :json,
25
14
  :info, ## not used yet -- page info incl. country_name, region_name, last_updated etc.
@@ -29,7 +18,7 @@ attr_reader :text,
29
18
 
30
19
  def initialize( text )
31
20
  @text = text
32
-
21
+
33
22
  @json = JSON.parse( text )
34
23
 
35
24
  @info = nil ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
@@ -40,16 +29,16 @@ def initialize( text )
40
29
  @json.each do |k1,v1|
41
30
  sect_title = k1
42
31
  sect_subsects = v1
43
-
32
+
44
33
  sect = Sect.new
45
34
  sect.title = sect_title
46
-
35
+
47
36
  ## get subsections
48
37
  subsects = []
49
38
  sect_subsects.each do |k2,v2|
50
39
  subsect_title = k2
51
40
  subsect_data = v2
52
-
41
+
53
42
  subsect = Subsect.new
54
43
  subsect.title = subsect_title
55
44
 
@@ -61,13 +50,13 @@ def initialize( text )
61
50
  new_subsect_data[ normalize_category(k3) ] = v3
62
51
  end
63
52
  subsect_data = new_subsect_data
64
- end
65
-
53
+ end
54
+
66
55
  subsect.data = subsect_data
67
-
56
+
68
57
  subsects << subsect
69
58
  end
70
-
59
+
71
60
  sect.subsects = subsects
72
61
  @sects << sect
73
62
  end
@@ -16,7 +16,7 @@ class Codes
16
16
  :region, ## e.g. Europe, Oceans, etc.
17
17
  )
18
18
 
19
- def self.from_csv( path )
19
+ def self.read_csv( path )
20
20
  ###
21
21
  # note:
22
22
  # if you use quotes - NO leading spaces allowed e.g.
@@ -46,9 +46,10 @@ class Codes
46
46
  recs << rec
47
47
  end
48
48
 
49
- self.new( recs )
49
+ new( recs )
50
50
  end
51
51
 
52
+
52
53
  def initialize( codes )
53
54
  @codes = codes
54
55
  end
@@ -9,7 +9,7 @@ class Comparisons
9
9
  :name,
10
10
  )
11
11
 
12
- def self.from_csv( path )
12
+ def self.read_csv( path )
13
13
 
14
14
  rows = CsvHash.read( path )
15
15
 
@@ -27,7 +27,7 @@ class Comparisons
27
27
  recs << rec
28
28
  end
29
29
 
30
- self.new( recs )
30
+ new( recs )
31
31
  end
32
32
 
33
33
  def initialize( comps )
@@ -2,28 +2,6 @@
2
2
  module Factbook
3
3
 
4
4
 
5
- ## note:
6
- ## some factbook pages with chrome (headers, footers, etc.)
7
- ## are NOT valid utf-8, thus,
8
- ## treat page as is (e.g. ASCII8BIT)
9
- #
10
- # only convert to utf8 when header and footer got stripped
11
-
12
- ##
13
- ## be/benin:
14
- ## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
15
- #
16
- ## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
17
- # Lazare Sèhouéto
18
- #
19
- # looks good - use (assume) Windows-1252 ????
20
-
21
- ##
22
- # check for is ascii 7-bit ??? if yes -noworries
23
- # if not, log number of chars not using ascii 7-bit
24
-
25
-
26
-
27
5
  class Page
28
6
  include LogUtils::Logging
29
7
 
@@ -35,52 +13,85 @@ class Page
35
13
  ## standard version (note: requires https)
36
14
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
37
15
 
38
- def initialize( code, opts={} )
39
- ### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
40
16
 
41
- if opts[:json]
42
- json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
43
- b = JsonBuilder.from_string( json )
17
+ def self.parse( html ) ## parse html from string
18
+ new( html: html )
19
+ end
20
+
21
+ def self.read( path )
22
+ html = File.open( path, 'r:utf-8' ) { |f| f.read }
23
+ new( html: html )
24
+ end
25
+
26
+ def self.parse_json( json ) ## parse json from string
27
+ new( json: json )
28
+ end
29
+
30
+ def self.read_json( path )
31
+ json = File.open( path, 'r:utf-8' ) { |f| f.read }
32
+ new( json: json )
33
+ end
34
+
35
+ def self.download( code, cache: false )
36
+ new( code, cache: cache )
37
+ end
38
+
39
+ ## some convenience alias(es)
40
+ class << self
41
+ alias_method :read_html, :read
42
+ alias_method :parse_html, :parse
43
+ end
44
+
45
+
46
+ def initialize( code=nil,
47
+ json: nil,
48
+ html: nil,
49
+ cache: false,
50
+ info: nil )
51
+ if json
52
+ ## note: assumes json is (still) a string/text
53
+ ## (NOT yet parsed to structured data)
54
+ b = JsonBuilder.new( json )
44
55
  else ## assume html
45
- if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
46
- ## for debugging and testing allow "custom" passed-in html page
47
- html = opts[:html]
56
+ if html
57
+ ## for debugging and testing allow "custom" passed-in html page
48
58
  else
49
- url_string = SITE_BASE.gsub( '{code}', code )
50
- ## note: expects ASCII-7BIT/BINARY encoding
59
+ ## allow passing in code struct too - just use/pluck two-letter code from struct !!!
60
+ code = code.code if code.is_a?( Codes::Code )
51
61
 
52
- ## html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
53
- html = Webcache.read( url_string )
62
+ raise ArgumentError, "two letter code (e.g. au) required to download page & build page url" if code.nil?
63
+ url = SITE_BASE.sub( '{code}', code )
64
+
65
+ html = if cache && Webcache.exist?( url )
66
+ Webcache.read( url ) ## for debugging - read from cache
67
+ else
68
+ download_page( url )
69
+ end
54
70
  end
55
- b = Builder.from_string( html )
71
+ b = Builder.new( html )
56
72
  end
57
73
 
58
74
  @sects = b.sects
59
75
  @info = b.info
60
76
 
61
77
  ## todo/fix/quick hack:
62
- ## check for info opts hash entry - lets you overwrite page info
78
+ ## check for info opts - lets you overwrite page info
63
79
  ## -- use proper header to setup page info - why, why not??
64
- if opts[:info]
65
- info = opts[:info]
66
- @info = info
67
- end
80
+ @info = info if info
81
+
68
82
 
69
83
  @data = {}
70
84
  @sects.each do |sect|
71
85
  @data[ sect.title ] = sect.data
72
86
  end
73
-
74
- self ## return self (check - not needed??)
75
87
  end
76
88
 
77
89
 
78
- def to_json( opts={} ) ## convenience helper for data.to_json; note: pretty print by default!
79
- if opts[:minify]
90
+ def to_json( minify: false ) ## convenience helper for data.to_json; note: pretty print by default!
91
+ if minify
80
92
  data.to_json
81
- else
82
- ## was: -- opts[:pretty] || opts[:pp]
83
- JSON.pretty_generate( data ) ## note: pretty print by default!
93
+ else ## note: pretty print by default!
94
+ JSON.pretty_generate( data )
84
95
  end
85
96
  end
86
97
 
@@ -96,30 +107,9 @@ class Page
96
107
  data[key]
97
108
  end
98
109
 
99
- ## add convenience (shortcut) accessors / attributes / fields / getters
100
-
101
- ATTRIBUTES.each do |attrib|
102
- ## e.g.
103
- ## def background() data['Introduction']['Background']['text']; end
104
- ## def location() data['Geography']['Location']['text']; end
105
- ## etc.
106
- if attrib.path.size == 1
107
- define_method attrib.name.to_sym do
108
- @data.fetch( attrib.category, {} ).
109
- fetch( attrib.path[0], {} )['text']
110
- end
111
- else ## assume size 2 for now
112
- define_method attrib.name.to_sym do
113
- @data.fetch( attrib.category, {} ).
114
- fetch( attrib.path[0], {} ).
115
- fetch( attrib.path[1], {} )['text']
116
- end
117
- end
118
- end
119
-
120
110
 
121
111
  private
122
- def fetch_page( url )
112
+ def download_page( url )
123
113
  response = Webget.page( url )
124
114
 
125
115
  ## note: exit on get / fetch error - do NOT continue for now - why? why not?
@@ -128,21 +118,5 @@ private
128
118
 
129
119
  response.text
130
120
  end
131
-
132
-
133
- =begin
134
- def self.from_url( cc, cn )
135
- html_ascii = PageFetcher.new.fetch( cc )
136
- self.new( cc, cn, html_ascii )
137
- end
138
-
139
- def self.from_file( cc, cn, opts={} )
140
- input_dir = opts[:input_dir] || '.'
141
- html_ascii = File.read( "#{input_dir}/#{cc}.html" ) ## fix/todo: use ASCII8BIT/binary reader
142
- self.new( cc, cn, html_ascii )
143
- end
144
- =end
145
-
146
-
147
121
  end # class Page
148
122
  end # module Factbook