factbook-readers 0.0.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +0 -16
- data/README.md +13 -14
- data/data/codes.csv +1 -1
- data/lib/factbook-readers.rb +11 -12
- data/lib/factbook-readers/builder.rb +28 -53
- data/lib/factbook-readers/builder_json.rb +9 -20
- data/lib/factbook-readers/codes.rb +3 -2
- data/lib/factbook-readers/comparisons.rb +2 -2
- data/lib/factbook-readers/page.rb +59 -85
- data/lib/factbook-readers/sanitizer.rb +13 -34
- data/lib/factbook-readers/version.rb +2 -2
- data/test/helper.rb +1 -5
- data/test/test_builder.rb +1 -6
- data/test/test_codes.rb +5 -9
- data/test/test_comparisons.rb +2 -5
- data/test/test_counter.rb +4 -6
- data/test/test_fields.rb +0 -2
- data/test/test_item_builder.rb +7 -9
- data/test/test_json.rb +1 -3
- data/test/test_json_builder.rb +1 -3
- data/test/test_normalize.rb +0 -2
- data/test/test_page.rb +2 -4
- data/test/test_sanitizer.rb +2 -5
- data/test/test_sanitizer_regex.rb +0 -2
- metadata +2 -18
- data/data/attributes.yml +0 -337
- data/lib/factbook-readers/attributes.rb +0 -74
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_convert.rb +0 -30
- data/test/test_importer.rb +0 -56
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d9bc3eaf2cb6fa3774e0b7a25b53336be2b05a55
|
4
|
+
data.tar.gz: 86565dc92913645110beec803d7bc0a7c088155f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 755b8727d0bbcaecd97f52064b1b29321e1b59a72bff55bbdd995ed8968732def7480f4cab0f222bf4c9d163afbd5230647237b96d41dc136006f0f9a9473550
|
7
|
+
data.tar.gz: 638dcf4f4a552c705a743c9e7483e457303d2090a9204ee3d4b390c3256b537050ea4fdc0957cba461dcee549ecf52b541c4b691dcb8c500c4439eaf376d4a87
|
data/Manifest.txt
CHANGED
@@ -2,13 +2,11 @@ CHANGELOG.md
|
|
2
2
|
Manifest.txt
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
|
-
data/attributes.yml
|
6
5
|
data/categories.csv
|
7
6
|
data/codes.csv
|
8
7
|
data/codesxref.csv
|
9
8
|
data/comparisons.csv
|
10
9
|
lib/factbook-readers.rb
|
11
|
-
lib/factbook-readers/attributes.rb
|
12
10
|
lib/factbook-readers/builder.rb
|
13
11
|
lib/factbook-readers/builder_item.rb
|
14
12
|
lib/factbook-readers/builder_json.rb
|
@@ -27,26 +25,12 @@ lib/factbook-readers/utils.rb
|
|
27
25
|
lib/factbook-readers/utils_info.rb
|
28
26
|
lib/factbook-readers/version.rb
|
29
27
|
lib/factbook/readers.rb
|
30
|
-
test/data/au.html
|
31
|
-
test/data/au.yml
|
32
|
-
test/data/be.html
|
33
|
-
test/data/be.yml
|
34
|
-
test/data/json/au.json
|
35
|
-
test/data/src/ag.html
|
36
|
-
test/data/src/au-2015-09-24.html
|
37
|
-
test/data/src/au.html
|
38
|
-
test/data/src/be-2015-09-24.html
|
39
|
-
test/data/src/be.html
|
40
28
|
test/helper.rb
|
41
|
-
test/test_attribs.rb
|
42
|
-
test/test_attribs_def.rb
|
43
29
|
test/test_builder.rb
|
44
30
|
test/test_codes.rb
|
45
31
|
test/test_comparisons.rb
|
46
|
-
test/test_convert.rb
|
47
32
|
test/test_counter.rb
|
48
33
|
test/test_fields.rb
|
49
|
-
test/test_importer.rb
|
50
34
|
test/test_item_builder.rb
|
51
35
|
test/test_json.rb
|
52
36
|
test/test_json_builder.rb
|
data/README.md
CHANGED
@@ -55,36 +55,35 @@ resulting in:
|
|
55
55
|
...
|
56
56
|
```
|
57
57
|
|
58
|
-
### Use
|
58
|
+
### Use data attributes
|
59
59
|
|
60
60
|
```ruby
|
61
|
-
pp page
|
61
|
+
pp page['Introduction']['Background']['text']
|
62
62
|
# => "Following more than three centuries..."
|
63
|
-
pp page
|
63
|
+
pp page['Geography']['Area']['total']['text']
|
64
64
|
# => "8,515,770 sq km"
|
65
|
-
pp page
|
65
|
+
pp page['Geography']['Area']['land']['text']
|
66
66
|
# => "8,358,140 sq km"
|
67
|
-
pp page
|
67
|
+
pp page['Geography']['Area']['water']['text']
|
68
68
|
# => "157,630 sq km"
|
69
|
-
pp page
|
69
|
+
pp page['Geography']['Area']['note']['text']
|
70
70
|
# => "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."
|
71
|
-
pp page
|
71
|
+
pp page['Geography']['Area - comparative']['text']
|
72
72
|
# => "slightly smaller than the US"
|
73
|
-
pp page
|
73
|
+
pp page['Geography']['Climate']['text']
|
74
74
|
# => "mostly tropical, but temperate in south"
|
75
|
-
pp page
|
75
|
+
pp page['Geography']['Terrain']['text']
|
76
76
|
# => "mostly flat to rolling lowlands in north; ..."
|
77
|
-
pp page
|
77
|
+
pp page['Geography']['Elevation extremes']['lowest point']['text']
|
78
78
|
# => "Atlantic Ocean 0 m"
|
79
|
-
pp page
|
79
|
+
pp page['Geography']['Elevation extremes']['highest point']['text']
|
80
80
|
# => "Pico da Neblina 2,994 m"
|
81
|
-
pp page
|
81
|
+
pp page['Geography']['Natural resources']['text']
|
82
82
|
# => "bauxite, gold, iron ore, manganese, nickel, phosphates, ..."
|
83
83
|
...
|
84
84
|
```
|
85
85
|
|
86
|
-
See [
|
87
|
-
See [Attributes](ATTRIBUTES.md) for a quick reference listing.
|
86
|
+
See [Attributes](../ATTRIBUTES.md) for a quick reference listing.
|
88
87
|
|
89
88
|
|
90
89
|
### Save to disk as JSON
|
data/data/codes.csv
CHANGED
@@ -216,7 +216,7 @@ sb,Saint Pierre and Miquelon,Dependencies (France),North America
|
|
216
216
|
wf,Wallis and Futuna,Dependencies (France),Australia-Oceania
|
217
217
|
aa,Aruba,Dependencies (Netherlands),Central America and Caribbean
|
218
218
|
uc,Curacao,Dependencies (Netherlands),Central America and Caribbean
|
219
|
-
|
219
|
+
nn,Sint Maarten,Dependencies (Netherlands),Central America and Caribbean
|
220
220
|
cw,Cook Islands,Dependencies (New Zealand),Australia-Oceania
|
221
221
|
ne,Niue,Dependencies (New Zealand),Australia-Oceania
|
222
222
|
tl,Tokelau,Dependencies (New Zealand),Australia-Oceania
|
data/lib/factbook-readers.rb
CHANGED
@@ -17,22 +17,21 @@ require 'factbook-readers/version' # let it always go first
|
|
17
17
|
|
18
18
|
require 'factbook-readers/codes'
|
19
19
|
require 'factbook-readers/comparisons'
|
20
|
-
require 'factbook-readers/attributes'
|
21
20
|
|
22
|
-
module Factbook
|
23
|
-
|
24
|
-
## auto-load builtin codes, comparisons, attributes, etc.
|
25
|
-
CODES = Codes.from_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" )
|
26
|
-
COMPARISONS = Comparisons.from_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
|
27
|
-
ATTRIBUTES = Attributes.from_yaml( "#{Factbook::Module::Readers.root}/data/attributes.yml" )
|
28
|
-
|
29
|
-
def self.codes() CODES; end
|
30
|
-
def self.comparisons() COMPARISONS; end
|
31
|
-
def self.attributes() ATTRIBUTES; end
|
32
21
|
|
22
|
+
## note: make codes, comparisons available
|
23
|
+
module Factbook
|
24
|
+
## note: load on demand only builtin codes, comparisons, etc.
|
25
|
+
## for now
|
26
|
+
def self.codes
|
27
|
+
@@codes ||= Codes.read_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" );
|
28
|
+
end
|
29
|
+
def self.comparisons
|
30
|
+
@@comparisons ||= Comparisons.read_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
|
31
|
+
end
|
33
32
|
end # module Factbook
|
34
33
|
|
35
|
-
|
34
|
+
|
36
35
|
|
37
36
|
require 'factbook-readers/utils'
|
38
37
|
require 'factbook-readers/utils_info'
|
@@ -1,49 +1,29 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
5
|
-
class Builder ## todo: change to PageBuilder ???
|
4
|
+
class Builder ## todo: change to HtmlBuilder or PageBuilder ???
|
6
5
|
include LogUtils::Logging
|
7
6
|
|
8
7
|
|
9
|
-
=begin
|
10
|
-
def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
|
11
|
-
## check/todo: rename input_dir to just dir or to include ?
|
12
|
-
## (there's no output_dir)?? - why? why not?
|
13
|
-
input_dir = opts[:input_dir] || '.'
|
14
|
-
self.from_file( "#{input_dir}/#{cc}.html" )
|
15
|
-
end
|
16
|
-
=end
|
17
|
-
|
18
|
-
|
19
|
-
def self.from_file( path )
|
20
|
-
html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
|
21
|
-
self.from_string( html_ascii )
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.from_string( html_ascii ) ## note: expects ASCII-7BIT/BINARY encoding
|
25
|
-
self.new( html_ascii )
|
26
|
-
end
|
27
|
-
|
28
8
|
|
29
|
-
attr_reader :
|
30
|
-
:html,
|
31
|
-
:html_debug,
|
9
|
+
attr_reader :html_original, ## full "original" 1:1 page
|
10
|
+
:html, ## cut-out and sanitized profile
|
11
|
+
:html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
|
32
12
|
:info, ## page info incl. country_name, region_name, last_updated etc.
|
33
13
|
:errors, ## encoding errors etc.
|
34
14
|
:sects
|
35
15
|
|
36
16
|
|
37
|
-
def initialize(
|
38
|
-
@
|
17
|
+
def initialize( html_original )
|
18
|
+
@html_original = html_original
|
39
19
|
|
40
|
-
|
41
|
-
@html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
|
20
|
+
@html, @info, @errors = Sanitizer.new.sanitize( @html_original )
|
42
21
|
|
43
22
|
|
44
23
|
html_sects = if @html.empty?
|
45
24
|
## note: support "empty" pages - old format waiting for update!!!
|
46
25
|
## cannot parse for now
|
26
|
+
@html_debug = ''
|
47
27
|
[] ## return empty (no) sections for now - sorry (it's just one page with code cc anyway!!)
|
48
28
|
else
|
49
29
|
@html_debug = map_sects( @html )
|
@@ -55,7 +35,7 @@ def initialize( html_ascii )
|
|
55
35
|
pp html_sects
|
56
36
|
|
57
37
|
## debug
|
58
|
-
##
|
38
|
+
## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
|
59
39
|
|
60
40
|
|
61
41
|
@sects = []
|
@@ -101,25 +81,22 @@ def initialize( html_ascii )
|
|
101
81
|
## warn/fix: no section title found
|
102
82
|
end
|
103
83
|
end
|
104
|
-
|
105
|
-
self ## return self -- needed?? default (standard) anyway?? check and remove
|
106
84
|
end
|
107
85
|
|
108
86
|
|
87
|
+
H2_RE = /<h2>
|
88
|
+
\s*
|
89
|
+
(.+?) ## note: use non-greedy; do NOT allow tags inside for now
|
90
|
+
\s*
|
91
|
+
<\/h2>
|
92
|
+
/xim
|
109
93
|
|
110
94
|
def map_sects( html )
|
111
95
|
## convert section titles to "unified" marker
|
112
96
|
## e.g.
|
113
97
|
## <h2>Introduction</h2>
|
114
98
|
|
115
|
-
|
116
|
-
\s*
|
117
|
-
(.+?) ## note: use non-greedy; do NOT allow tags inside for now
|
118
|
-
\s*
|
119
|
-
<\/h2>
|
120
|
-
/xim
|
121
|
-
|
122
|
-
html = html.gsub( title_regex ) do |m|
|
99
|
+
html = html.gsub( H2_RE ) do |m|
|
123
100
|
puts "** found section >#{$1}<:"
|
124
101
|
puts " >|#{m}|<"
|
125
102
|
|
@@ -129,19 +106,19 @@ def map_sects( html )
|
|
129
106
|
end
|
130
107
|
|
131
108
|
|
109
|
+
H3_RE = /<h3>
|
110
|
+
\s*
|
111
|
+
(.+?) ## note: use non-greedy; allows tags inside - why? why not
|
112
|
+
\s*
|
113
|
+
<\/h3>
|
114
|
+
/xim
|
115
|
+
|
132
116
|
def map_subsects( html )
|
133
117
|
## convert subsection titles to "unified" marker
|
134
118
|
## e.g.
|
135
119
|
## <h3>Disputes - international:</h3>
|
136
120
|
|
137
|
-
|
138
|
-
\s*
|
139
|
-
(.+?) ## note: use non-greedy; allows tags inside - why? why not
|
140
|
-
\s*
|
141
|
-
<\/h3>
|
142
|
-
/xim
|
143
|
-
|
144
|
-
html = html.gsub( title_regex ) do |m|
|
121
|
+
html = html.gsub( H3_RE ) do |m|
|
145
122
|
puts "** found subsection >#{$1}<:"
|
146
123
|
puts " >|#{m}|<"
|
147
124
|
|
@@ -163,9 +140,8 @@ def split_sects( html )
|
|
163
140
|
## note: "wrap" regex in a capture group (just one)
|
164
141
|
## String#split will include all capture groups in the result array
|
165
142
|
|
166
|
-
|
167
|
-
|
168
|
-
chunks = html.split( section_regex )
|
143
|
+
## note: use non-greedy -- check: need to escape {} ??
|
144
|
+
chunks = html.split( /(@SECTION{.+?})/ )
|
169
145
|
|
170
146
|
## check if first item is a section or (html) prolog
|
171
147
|
# if prolog (remove)
|
@@ -194,9 +170,8 @@ def split_subsects( html )
|
|
194
170
|
## note: "wrap" regex in a capture group (just one)
|
195
171
|
## String#split will include all capture groups in the result array
|
196
172
|
|
197
|
-
|
198
|
-
|
199
|
-
chunks = html.split( subsection_regex )
|
173
|
+
## note: use non-greedy -- check: need to escape {} ??
|
174
|
+
chunks = html.split( /(@SUBSECTION{.+?})/ )
|
200
175
|
|
201
176
|
## check if first item is a section or (html) prolog
|
202
177
|
# if prolog (remove)
|
@@ -1,25 +1,14 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
5
4
|
######
|
6
5
|
# json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
|
7
6
|
|
8
|
-
class JsonBuilder
|
7
|
+
class JsonBuilder
|
9
8
|
include LogUtils::Logging
|
10
9
|
include NormalizeHelper ## e.g. normalize_category
|
11
10
|
|
12
11
|
|
13
|
-
def self.from_file( path )
|
14
|
-
text = File.read( path ) ## fix: use File.read_utf8 from textutils
|
15
|
-
self.from_string( text )
|
16
|
-
end
|
17
|
-
|
18
|
-
def self.from_string( text )
|
19
|
-
self.new( text )
|
20
|
-
end
|
21
|
-
|
22
|
-
|
23
12
|
attr_reader :text,
|
24
13
|
:json,
|
25
14
|
:info, ## not used yet -- page info incl. country_name, region_name, last_updated etc.
|
@@ -29,7 +18,7 @@ attr_reader :text,
|
|
29
18
|
|
30
19
|
def initialize( text )
|
31
20
|
@text = text
|
32
|
-
|
21
|
+
|
33
22
|
@json = JSON.parse( text )
|
34
23
|
|
35
24
|
@info = nil ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
|
@@ -40,16 +29,16 @@ def initialize( text )
|
|
40
29
|
@json.each do |k1,v1|
|
41
30
|
sect_title = k1
|
42
31
|
sect_subsects = v1
|
43
|
-
|
32
|
+
|
44
33
|
sect = Sect.new
|
45
34
|
sect.title = sect_title
|
46
|
-
|
35
|
+
|
47
36
|
## get subsections
|
48
37
|
subsects = []
|
49
38
|
sect_subsects.each do |k2,v2|
|
50
39
|
subsect_title = k2
|
51
40
|
subsect_data = v2
|
52
|
-
|
41
|
+
|
53
42
|
subsect = Subsect.new
|
54
43
|
subsect.title = subsect_title
|
55
44
|
|
@@ -61,13 +50,13 @@ def initialize( text )
|
|
61
50
|
new_subsect_data[ normalize_category(k3) ] = v3
|
62
51
|
end
|
63
52
|
subsect_data = new_subsect_data
|
64
|
-
end
|
65
|
-
|
53
|
+
end
|
54
|
+
|
66
55
|
subsect.data = subsect_data
|
67
|
-
|
56
|
+
|
68
57
|
subsects << subsect
|
69
58
|
end
|
70
|
-
|
59
|
+
|
71
60
|
sect.subsects = subsects
|
72
61
|
@sects << sect
|
73
62
|
end
|
@@ -16,7 +16,7 @@ class Codes
|
|
16
16
|
:region, ## e.g. Europe, Oceans, etc.
|
17
17
|
)
|
18
18
|
|
19
|
-
def self.
|
19
|
+
def self.read_csv( path )
|
20
20
|
###
|
21
21
|
# note:
|
22
22
|
# if you use quotes - NO leading spaces allowed e.g.
|
@@ -46,9 +46,10 @@ class Codes
|
|
46
46
|
recs << rec
|
47
47
|
end
|
48
48
|
|
49
|
-
|
49
|
+
new( recs )
|
50
50
|
end
|
51
51
|
|
52
|
+
|
52
53
|
def initialize( codes )
|
53
54
|
@codes = codes
|
54
55
|
end
|
@@ -9,7 +9,7 @@ class Comparisons
|
|
9
9
|
:name,
|
10
10
|
)
|
11
11
|
|
12
|
-
def self.
|
12
|
+
def self.read_csv( path )
|
13
13
|
|
14
14
|
rows = CsvHash.read( path )
|
15
15
|
|
@@ -27,7 +27,7 @@ class Comparisons
|
|
27
27
|
recs << rec
|
28
28
|
end
|
29
29
|
|
30
|
-
|
30
|
+
new( recs )
|
31
31
|
end
|
32
32
|
|
33
33
|
def initialize( comps )
|
@@ -2,28 +2,6 @@
|
|
2
2
|
module Factbook
|
3
3
|
|
4
4
|
|
5
|
-
## note:
|
6
|
-
## some factbook pages with chrome (headers, footers, etc.)
|
7
|
-
## are NOT valid utf-8, thus,
|
8
|
-
## treat page as is (e.g. ASCII8BIT)
|
9
|
-
#
|
10
|
-
# only convert to utf8 when header and footer got stripped
|
11
|
-
|
12
|
-
##
|
13
|
-
## be/benin:
|
14
|
-
## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
|
15
|
-
#
|
16
|
-
## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
|
17
|
-
# Lazare Sèhouéto
|
18
|
-
#
|
19
|
-
# looks good - use (assume) Windows-1252 ????
|
20
|
-
|
21
|
-
##
|
22
|
-
# check for is ascii 7-bit ??? if yes -noworries
|
23
|
-
# if not, log number of chars not using ascii 7-bit
|
24
|
-
|
25
|
-
|
26
|
-
|
27
5
|
class Page
|
28
6
|
include LogUtils::Logging
|
29
7
|
|
@@ -35,52 +13,85 @@ class Page
|
|
35
13
|
## standard version (note: requires https)
|
36
14
|
SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
|
37
15
|
|
38
|
-
def initialize( code, opts={} )
|
39
|
-
### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
|
40
16
|
|
41
|
-
|
42
|
-
|
43
|
-
|
17
|
+
def self.parse( html ) ## parse html from string
|
18
|
+
new( html: html )
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.read( path )
|
22
|
+
html = File.open( path, 'r:utf-8' ) { |f| f.read }
|
23
|
+
new( html: html )
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.parse_json( json ) ## parse json from string
|
27
|
+
new( json: json )
|
28
|
+
end
|
29
|
+
|
30
|
+
def self.read_json( path )
|
31
|
+
json = File.open( path, 'r:utf-8' ) { |f| f.read }
|
32
|
+
new( json: json )
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.download( code, cache: false )
|
36
|
+
new( code, cache: cache )
|
37
|
+
end
|
38
|
+
|
39
|
+
## some convenience alias(es)
|
40
|
+
class << self
|
41
|
+
alias_method :read_html, :read
|
42
|
+
alias_method :parse_html, :parse
|
43
|
+
end
|
44
|
+
|
45
|
+
|
46
|
+
def initialize( code=nil,
|
47
|
+
json: nil,
|
48
|
+
html: nil,
|
49
|
+
cache: false,
|
50
|
+
info: nil )
|
51
|
+
if json
|
52
|
+
## note: assumes json is (still) a string/text
|
53
|
+
## (NOT yet parsed to structured data)
|
54
|
+
b = JsonBuilder.new( json )
|
44
55
|
else ## assume html
|
45
|
-
if
|
46
|
-
|
47
|
-
html = opts[:html]
|
56
|
+
if html
|
57
|
+
## for debugging and testing allow "custom" passed-in html page
|
48
58
|
else
|
49
|
-
|
50
|
-
|
59
|
+
## allow passing in code struct too - just use/pluck two-letter code from struct !!!
|
60
|
+
code = code.code if code.is_a?( Codes::Code )
|
51
61
|
|
52
|
-
|
53
|
-
|
62
|
+
raise ArgumentError, "two letter code (e.g. au) required to download page & build page url" if code.nil?
|
63
|
+
url = SITE_BASE.sub( '{code}', code )
|
64
|
+
|
65
|
+
html = if cache && Webcache.exist?( url )
|
66
|
+
Webcache.read( url ) ## for debugging - read from cache
|
67
|
+
else
|
68
|
+
download_page( url )
|
69
|
+
end
|
54
70
|
end
|
55
|
-
b = Builder.
|
71
|
+
b = Builder.new( html )
|
56
72
|
end
|
57
73
|
|
58
74
|
@sects = b.sects
|
59
75
|
@info = b.info
|
60
76
|
|
61
77
|
## todo/fix/quick hack:
|
62
|
-
## check for info opts
|
78
|
+
## check for info opts - lets you overwrite page info
|
63
79
|
## -- use proper header to setup page info - why, why not??
|
64
|
-
if
|
65
|
-
|
66
|
-
@info = info
|
67
|
-
end
|
80
|
+
@info = info if info
|
81
|
+
|
68
82
|
|
69
83
|
@data = {}
|
70
84
|
@sects.each do |sect|
|
71
85
|
@data[ sect.title ] = sect.data
|
72
86
|
end
|
73
|
-
|
74
|
-
self ## return self (check - not needed??)
|
75
87
|
end
|
76
88
|
|
77
89
|
|
78
|
-
def to_json(
|
79
|
-
if
|
90
|
+
def to_json( minify: false ) ## convenience helper for data.to_json; note: pretty print by default!
|
91
|
+
if minify
|
80
92
|
data.to_json
|
81
|
-
else
|
82
|
-
|
83
|
-
JSON.pretty_generate( data ) ## note: pretty print by default!
|
93
|
+
else ## note: pretty print by default!
|
94
|
+
JSON.pretty_generate( data )
|
84
95
|
end
|
85
96
|
end
|
86
97
|
|
@@ -96,30 +107,9 @@ class Page
|
|
96
107
|
data[key]
|
97
108
|
end
|
98
109
|
|
99
|
-
## add convenience (shortcut) accessors / attributes / fields / getters
|
100
|
-
|
101
|
-
ATTRIBUTES.each do |attrib|
|
102
|
-
## e.g.
|
103
|
-
## def background() data['Introduction']['Background']['text']; end
|
104
|
-
## def location() data['Geography']['Location']['text']; end
|
105
|
-
## etc.
|
106
|
-
if attrib.path.size == 1
|
107
|
-
define_method attrib.name.to_sym do
|
108
|
-
@data.fetch( attrib.category, {} ).
|
109
|
-
fetch( attrib.path[0], {} )['text']
|
110
|
-
end
|
111
|
-
else ## assume size 2 for now
|
112
|
-
define_method attrib.name.to_sym do
|
113
|
-
@data.fetch( attrib.category, {} ).
|
114
|
-
fetch( attrib.path[0], {} ).
|
115
|
-
fetch( attrib.path[1], {} )['text']
|
116
|
-
end
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
110
|
|
121
111
|
private
|
122
|
-
def
|
112
|
+
def download_page( url )
|
123
113
|
response = Webget.page( url )
|
124
114
|
|
125
115
|
## note: exit on get / fetch error - do NOT continue for now - why? why not?
|
@@ -128,21 +118,5 @@ private
|
|
128
118
|
|
129
119
|
response.text
|
130
120
|
end
|
131
|
-
|
132
|
-
|
133
|
-
=begin
|
134
|
-
def self.from_url( cc, cn )
|
135
|
-
html_ascii = PageFetcher.new.fetch( cc )
|
136
|
-
self.new( cc, cn, html_ascii )
|
137
|
-
end
|
138
|
-
|
139
|
-
def self.from_file( cc, cn, opts={} )
|
140
|
-
input_dir = opts[:input_dir] || '.'
|
141
|
-
html_ascii = File.read( "#{input_dir}/#{cc}.html" ) ## fix/todo: use ASCII8BIT/binary reader
|
142
|
-
self.new( cc, cn, html_ascii )
|
143
|
-
end
|
144
|
-
=end
|
145
|
-
|
146
|
-
|
147
121
|
end # class Page
|
148
122
|
end # module Factbook
|