factbook-readers 0.0.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +0 -16
- data/README.md +13 -14
- data/data/codes.csv +1 -1
- data/lib/factbook-readers.rb +11 -12
- data/lib/factbook-readers/builder.rb +28 -53
- data/lib/factbook-readers/builder_json.rb +9 -20
- data/lib/factbook-readers/codes.rb +3 -2
- data/lib/factbook-readers/comparisons.rb +2 -2
- data/lib/factbook-readers/page.rb +59 -85
- data/lib/factbook-readers/sanitizer.rb +13 -34
- data/lib/factbook-readers/version.rb +2 -2
- data/test/helper.rb +1 -5
- data/test/test_builder.rb +1 -6
- data/test/test_codes.rb +5 -9
- data/test/test_comparisons.rb +2 -5
- data/test/test_counter.rb +4 -6
- data/test/test_fields.rb +0 -2
- data/test/test_item_builder.rb +7 -9
- data/test/test_json.rb +1 -3
- data/test/test_json_builder.rb +1 -3
- data/test/test_normalize.rb +0 -2
- data/test/test_page.rb +2 -4
- data/test/test_sanitizer.rb +2 -5
- data/test/test_sanitizer_regex.rb +0 -2
- metadata +2 -18
- data/data/attributes.yml +0 -337
- data/lib/factbook-readers/attributes.rb +0 -74
- data/test/data/au.html +0 -579
- data/test/data/au.yml +0 -8
- data/test/data/be.html +0 -596
- data/test/data/be.yml +0 -8
- data/test/data/json/au.json +0 -892
- data/test/data/src/ag.html +0 -716
- data/test/data/src/au-2015-09-24.html +0 -2006
- data/test/data/src/au.html +0 -658
- data/test/data/src/be-2015-09-24.html +0 -2011
- data/test/data/src/be.html +0 -648
- data/test/test_attribs.rb +0 -87
- data/test/test_attribs_def.rb +0 -20
- data/test/test_convert.rb +0 -30
- data/test/test_importer.rb +0 -56
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d9bc3eaf2cb6fa3774e0b7a25b53336be2b05a55
|
4
|
+
data.tar.gz: 86565dc92913645110beec803d7bc0a7c088155f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 755b8727d0bbcaecd97f52064b1b29321e1b59a72bff55bbdd995ed8968732def7480f4cab0f222bf4c9d163afbd5230647237b96d41dc136006f0f9a9473550
|
7
|
+
data.tar.gz: 638dcf4f4a552c705a743c9e7483e457303d2090a9204ee3d4b390c3256b537050ea4fdc0957cba461dcee549ecf52b541c4b691dcb8c500c4439eaf376d4a87
|
data/Manifest.txt
CHANGED
@@ -2,13 +2,11 @@ CHANGELOG.md
|
|
2
2
|
Manifest.txt
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
|
-
data/attributes.yml
|
6
5
|
data/categories.csv
|
7
6
|
data/codes.csv
|
8
7
|
data/codesxref.csv
|
9
8
|
data/comparisons.csv
|
10
9
|
lib/factbook-readers.rb
|
11
|
-
lib/factbook-readers/attributes.rb
|
12
10
|
lib/factbook-readers/builder.rb
|
13
11
|
lib/factbook-readers/builder_item.rb
|
14
12
|
lib/factbook-readers/builder_json.rb
|
@@ -27,26 +25,12 @@ lib/factbook-readers/utils.rb
|
|
27
25
|
lib/factbook-readers/utils_info.rb
|
28
26
|
lib/factbook-readers/version.rb
|
29
27
|
lib/factbook/readers.rb
|
30
|
-
test/data/au.html
|
31
|
-
test/data/au.yml
|
32
|
-
test/data/be.html
|
33
|
-
test/data/be.yml
|
34
|
-
test/data/json/au.json
|
35
|
-
test/data/src/ag.html
|
36
|
-
test/data/src/au-2015-09-24.html
|
37
|
-
test/data/src/au.html
|
38
|
-
test/data/src/be-2015-09-24.html
|
39
|
-
test/data/src/be.html
|
40
28
|
test/helper.rb
|
41
|
-
test/test_attribs.rb
|
42
|
-
test/test_attribs_def.rb
|
43
29
|
test/test_builder.rb
|
44
30
|
test/test_codes.rb
|
45
31
|
test/test_comparisons.rb
|
46
|
-
test/test_convert.rb
|
47
32
|
test/test_counter.rb
|
48
33
|
test/test_fields.rb
|
49
|
-
test/test_importer.rb
|
50
34
|
test/test_item_builder.rb
|
51
35
|
test/test_json.rb
|
52
36
|
test/test_json_builder.rb
|
data/README.md
CHANGED
@@ -55,36 +55,35 @@ resulting in:
|
|
55
55
|
...
|
56
56
|
```
|
57
57
|
|
58
|
-
### Use
|
58
|
+
### Use data attributes
|
59
59
|
|
60
60
|
```ruby
|
61
|
-
pp page
|
61
|
+
pp page['Introduction']['Background']['text']
|
62
62
|
# => "Following more than three centuries..."
|
63
|
-
pp page
|
63
|
+
pp page['Geography']['Area']['total']['text']
|
64
64
|
# => "8,515,770 sq km"
|
65
|
-
pp page
|
65
|
+
pp page['Geography']['Area']['land']['text']
|
66
66
|
# => "8,358,140 sq km"
|
67
|
-
pp page
|
67
|
+
pp page['Geography']['Area']['water']['text']
|
68
68
|
# => "157,630 sq km"
|
69
|
-
pp page
|
69
|
+
pp page['Geography']['Area']['note']['text']
|
70
70
|
# => "includes Arquipelago de Fernando de Noronha, Atol das Rocas, ..."
|
71
|
-
pp page
|
71
|
+
pp page['Geography']['Area - comparative']['text']
|
72
72
|
# => "slightly smaller than the US"
|
73
|
-
pp page
|
73
|
+
pp page['Geography']['Climate']['text']
|
74
74
|
# => "mostly tropical, but temperate in south"
|
75
|
-
pp page
|
75
|
+
pp page['Geography']['Terrain']['text']
|
76
76
|
# => "mostly flat to rolling lowlands in north; ..."
|
77
|
-
pp page
|
77
|
+
pp page['Geography']['Elevation extremes']['lowest point']['text']
|
78
78
|
# => "Atlantic Ocean 0 m"
|
79
|
-
pp page
|
79
|
+
pp page['Geography']['Elevation extremes']['highest point']['text']
|
80
80
|
# => "Pico da Neblina 2,994 m"
|
81
|
-
pp page
|
81
|
+
pp page['Geography']['Natural resources']['text']
|
82
82
|
# => "bauxite, gold, iron ore, manganese, nickel, phosphates, ..."
|
83
83
|
...
|
84
84
|
```
|
85
85
|
|
86
|
-
See [
|
87
|
-
See [Attributes](ATTRIBUTES.md) for a quick reference listing.
|
86
|
+
See [Attributes](../ATTRIBUTES.md) for a quick reference listing.
|
88
87
|
|
89
88
|
|
90
89
|
### Save to disk as JSON
|
data/data/codes.csv
CHANGED
@@ -216,7 +216,7 @@ sb,Saint Pierre and Miquelon,Dependencies (France),North America
|
|
216
216
|
wf,Wallis and Futuna,Dependencies (France),Australia-Oceania
|
217
217
|
aa,Aruba,Dependencies (Netherlands),Central America and Caribbean
|
218
218
|
uc,Curacao,Dependencies (Netherlands),Central America and Caribbean
|
219
|
-
|
219
|
+
nn,Sint Maarten,Dependencies (Netherlands),Central America and Caribbean
|
220
220
|
cw,Cook Islands,Dependencies (New Zealand),Australia-Oceania
|
221
221
|
ne,Niue,Dependencies (New Zealand),Australia-Oceania
|
222
222
|
tl,Tokelau,Dependencies (New Zealand),Australia-Oceania
|
data/lib/factbook-readers.rb
CHANGED
@@ -17,22 +17,21 @@ require 'factbook-readers/version' # let it always go first
|
|
17
17
|
|
18
18
|
require 'factbook-readers/codes'
|
19
19
|
require 'factbook-readers/comparisons'
|
20
|
-
require 'factbook-readers/attributes'
|
21
20
|
|
22
|
-
module Factbook
|
23
|
-
|
24
|
-
## auto-load builtin codes, comparisons, attributes, etc.
|
25
|
-
CODES = Codes.from_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" )
|
26
|
-
COMPARISONS = Comparisons.from_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
|
27
|
-
ATTRIBUTES = Attributes.from_yaml( "#{Factbook::Module::Readers.root}/data/attributes.yml" )
|
28
|
-
|
29
|
-
def self.codes() CODES; end
|
30
|
-
def self.comparisons() COMPARISONS; end
|
31
|
-
def self.attributes() ATTRIBUTES; end
|
32
21
|
|
22
|
+
## note: make codes, comparisons available
|
23
|
+
module Factbook
|
24
|
+
## note: load on demand only builtin codes, comparisons, etc.
|
25
|
+
## for now
|
26
|
+
def self.codes
|
27
|
+
@@codes ||= Codes.read_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" );
|
28
|
+
end
|
29
|
+
def self.comparisons
|
30
|
+
@@comparisons ||= Comparisons.read_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
|
31
|
+
end
|
33
32
|
end # module Factbook
|
34
33
|
|
35
|
-
|
34
|
+
|
36
35
|
|
37
36
|
require 'factbook-readers/utils'
|
38
37
|
require 'factbook-readers/utils_info'
|
@@ -1,49 +1,29 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
5
|
-
class Builder ## todo: change to PageBuilder ???
|
4
|
+
class Builder ## todo: change to HtmlBuilder or PageBuilder ???
|
6
5
|
include LogUtils::Logging
|
7
6
|
|
8
7
|
|
9
|
-
=begin
|
10
|
-
def self.from_cc( cc, opts={} ) ## rename to from_file_for_country() or from_file_for_cc() or something - why?? why not??
|
11
|
-
## check/todo: rename input_dir to just dir or to include ?
|
12
|
-
## (there's no output_dir)?? - why? why not?
|
13
|
-
input_dir = opts[:input_dir] || '.'
|
14
|
-
self.from_file( "#{input_dir}/#{cc}.html" )
|
15
|
-
end
|
16
|
-
=end
|
17
|
-
|
18
|
-
|
19
|
-
def self.from_file( path )
|
20
|
-
html_ascii = File.read( path ) ## fix/todo: use ASCII8BIT/binary reader !!!!!
|
21
|
-
self.from_string( html_ascii )
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.from_string( html_ascii ) ## note: expects ASCII-7BIT/BINARY encoding
|
25
|
-
self.new( html_ascii )
|
26
|
-
end
|
27
|
-
|
28
8
|
|
29
|
-
attr_reader :
|
30
|
-
:html,
|
31
|
-
:html_debug,
|
9
|
+
attr_reader :html_original, ## full "original" 1:1 page
|
10
|
+
:html, ## cut-out and sanitized profile
|
11
|
+
:html_debug, ## html w/ mapping markers - rename to html_markers - why? why not?
|
32
12
|
:info, ## page info incl. country_name, region_name, last_updated etc.
|
33
13
|
:errors, ## encoding erros etc.
|
34
14
|
:sects
|
35
15
|
|
36
16
|
|
37
|
-
def initialize( html_ascii )
|
38
|
-
@html_ascii = html_ascii
|
17
|
+
def initialize( html_original )
|
18
|
+
@html_original = html_original
|
39
19
|
|
40
|
-
|
41
|
-
@html, @info, @errors = Sanitizer.new.sanitize( @html_ascii )
|
20
|
+
@html, @info, @errors = Sanitizer.new.sanitize( @html_original )
|
42
21
|
|
43
22
|
|
44
23
|
html_sects = if @html.empty?
|
45
24
|
## note: support "empty" pages - old format waiting for update!!!
|
46
25
|
## cannot parse for now
|
26
|
+
@html_debug = ''
|
47
27
|
[] ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
|
48
28
|
else
|
49
29
|
@html_debug = map_sects( @html )
|
@@ -55,7 +35,7 @@ def initialize( html_ascii )
|
|
55
35
|
pp html_sects
|
56
36
|
|
57
37
|
## debug
|
58
|
-
##
|
38
|
+
## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }
|
59
39
|
|
60
40
|
|
61
41
|
@sects = []
|
@@ -101,25 +81,22 @@ def initialize( html_ascii )
|
|
101
81
|
## warn/fix: no section title found
|
102
82
|
end
|
103
83
|
end
|
104
|
-
|
105
|
-
self ## return self -- needed?? default (standard) anyway?? check and remove
|
106
84
|
end
|
107
85
|
|
108
86
|
|
87
|
+
H2_RE = /<h2>
|
88
|
+
\s*
|
89
|
+
(.+?) ## note: use non-greedy; do NOT allow tags inside for now
|
90
|
+
\s*
|
91
|
+
<\/h2>
|
92
|
+
/xim
|
109
93
|
|
110
94
|
def map_sects( html )
|
111
95
|
## convert section titles to "unified" marker
|
112
96
|
## e.g.
|
113
97
|
## <h2>Introduction</h2>
|
114
98
|
|
115
|
-
|
116
|
-
\s*
|
117
|
-
(.+?) ## note: use non-greedy; do NOT allow tags inside for now
|
118
|
-
\s*
|
119
|
-
<\/h2>
|
120
|
-
/xim
|
121
|
-
|
122
|
-
html = html.gsub( title_regex ) do |m|
|
99
|
+
html = html.gsub( H2_RE ) do |m|
|
123
100
|
puts "** found section >#{$1}<:"
|
124
101
|
puts " >|#{m}|<"
|
125
102
|
|
@@ -129,19 +106,19 @@ def map_sects( html )
|
|
129
106
|
end
|
130
107
|
|
131
108
|
|
109
|
+
H3_RE = /<h3>
|
110
|
+
\s*
|
111
|
+
(.+?) ## note: use non-greedy; allows tags inside - why? why not
|
112
|
+
\s*
|
113
|
+
<\/h3>
|
114
|
+
/xim
|
115
|
+
|
132
116
|
def map_subsects( html )
|
133
117
|
## convert subsection titles to "unified" marker
|
134
118
|
## e.g.
|
135
119
|
## <h3>Disputes - international:</h3>
|
136
120
|
|
137
|
-
|
138
|
-
\s*
|
139
|
-
(.+?) ## note: use non-greedy; allows tags inside - why? why not
|
140
|
-
\s*
|
141
|
-
<\/h3>
|
142
|
-
/xim
|
143
|
-
|
144
|
-
html = html.gsub( title_regex ) do |m|
|
121
|
+
html = html.gsub( H3_RE ) do |m|
|
145
122
|
puts "** found subsection >#{$1}<:"
|
146
123
|
puts " >|#{m}|<"
|
147
124
|
|
@@ -163,9 +140,8 @@ def split_sects( html )
|
|
163
140
|
## note: "wrap" regex in a capture group (just one)
|
164
141
|
## String#split will include all catpure groups in the result array
|
165
142
|
|
166
|
-
|
167
|
-
|
168
|
-
chunks = html.split( section_regex )
|
143
|
+
## note: use non-greedy -- check: need to escape {} ??
|
144
|
+
chunks = html.split( /(@SECTION{.+?})/ )
|
169
145
|
|
170
146
|
## check if first item is a section or (html) prolog
|
171
147
|
# if prolog (remove)
|
@@ -194,9 +170,8 @@ def split_subsects( html )
|
|
194
170
|
## note: "wrap" regex in a capture group (just one)
|
195
171
|
## String#split will include all catpure groups in the result array
|
196
172
|
|
197
|
-
|
198
|
-
|
199
|
-
chunks = html.split( subsection_regex )
|
173
|
+
## note: use non-greedy -- check: need to escape {} ??
|
174
|
+
chunks = html.split( /(@SUBSECTION{.+?})/ )
|
200
175
|
|
201
176
|
## check if first item is a section or (html) prolog
|
202
177
|
# if prolog (remove)
|
@@ -1,25 +1,14 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Factbook
|
4
3
|
|
5
4
|
######
|
6
5
|
# json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
|
7
6
|
|
8
|
-
class JsonBuilder
|
7
|
+
class JsonBuilder
|
9
8
|
include LogUtils::Logging
|
10
9
|
include NormalizeHelper ## e.g. normalize_category
|
11
10
|
|
12
11
|
|
13
|
-
def self.from_file( path )
|
14
|
-
text = File.read( path ) ## fix: use File.read_utf8 from textutils
|
15
|
-
self.from_string( text )
|
16
|
-
end
|
17
|
-
|
18
|
-
def self.from_string( text )
|
19
|
-
self.new( text )
|
20
|
-
end
|
21
|
-
|
22
|
-
|
23
12
|
attr_reader :text,
|
24
13
|
:json,
|
25
14
|
:info, ## not used yet -- page info incl. country_name, region_name, last_updated etc.
|
@@ -29,7 +18,7 @@ attr_reader :text,
|
|
29
18
|
|
30
19
|
def initialize( text )
|
31
20
|
@text = text
|
32
|
-
|
21
|
+
|
33
22
|
@json = JSON.parse( text )
|
34
23
|
|
35
24
|
@info = nil ## fix/todo: sorry - for now no page info (use header in json - why? why not??)
|
@@ -40,16 +29,16 @@ def initialize( text )
|
|
40
29
|
@json.each do |k1,v1|
|
41
30
|
sect_title = k1
|
42
31
|
sect_subsects = v1
|
43
|
-
|
32
|
+
|
44
33
|
sect = Sect.new
|
45
34
|
sect.title = sect_title
|
46
|
-
|
35
|
+
|
47
36
|
## get subsections
|
48
37
|
subsects = []
|
49
38
|
sect_subsects.each do |k2,v2|
|
50
39
|
subsect_title = k2
|
51
40
|
subsect_data = v2
|
52
|
-
|
41
|
+
|
53
42
|
subsect = Subsect.new
|
54
43
|
subsect.title = subsect_title
|
55
44
|
|
@@ -61,13 +50,13 @@ def initialize( text )
|
|
61
50
|
new_subsect_data[ normalize_category(k3) ] = v3
|
62
51
|
end
|
63
52
|
subsect_data = new_subsect_data
|
64
|
-
end
|
65
|
-
|
53
|
+
end
|
54
|
+
|
66
55
|
subsect.data = subsect_data
|
67
|
-
|
56
|
+
|
68
57
|
subsects << subsect
|
69
58
|
end
|
70
|
-
|
59
|
+
|
71
60
|
sect.subsects = subsects
|
72
61
|
@sects << sect
|
73
62
|
end
|
@@ -16,7 +16,7 @@ class Codes
|
|
16
16
|
:region, ## e.g. Europe, Oceans, etc.
|
17
17
|
)
|
18
18
|
|
19
|
-
def self.from_csv( path )
|
19
|
+
def self.read_csv( path )
|
20
20
|
###
|
21
21
|
# note:
|
22
22
|
# if you use quotes - NO leading spaces allowed e.g.
|
@@ -46,9 +46,10 @@ class Codes
|
|
46
46
|
recs << rec
|
47
47
|
end
|
48
48
|
|
49
|
-
|
49
|
+
new( recs )
|
50
50
|
end
|
51
51
|
|
52
|
+
|
52
53
|
def initialize( codes )
|
53
54
|
@codes = codes
|
54
55
|
end
|
@@ -9,7 +9,7 @@ class Comparisons
|
|
9
9
|
:name,
|
10
10
|
)
|
11
11
|
|
12
|
-
def self.from_csv( path )
|
12
|
+
def self.read_csv( path )
|
13
13
|
|
14
14
|
rows = CsvHash.read( path )
|
15
15
|
|
@@ -27,7 +27,7 @@ class Comparisons
|
|
27
27
|
recs << rec
|
28
28
|
end
|
29
29
|
|
30
|
-
|
30
|
+
new( recs )
|
31
31
|
end
|
32
32
|
|
33
33
|
def initialize( comps )
|
@@ -2,28 +2,6 @@
|
|
2
2
|
module Factbook
|
3
3
|
|
4
4
|
|
5
|
-
## note:
|
6
|
-
## some factbook pages with chrome (headers, footers, etc.)
|
7
|
-
## are NOT valid utf-8, thus,
|
8
|
-
## treat page as is (e.g. ASCII8BIT)
|
9
|
-
#
|
10
|
-
# only convert to utf8 when header and footer got stripped
|
11
|
-
|
12
|
-
##
|
13
|
-
## be/benin:
|
14
|
-
## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
|
15
|
-
#
|
16
|
-
## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
|
17
|
-
# Lazare Sèhouéto
|
18
|
-
#
|
19
|
-
# looks good - use (assume) Windows-1252 ????
|
20
|
-
|
21
|
-
##
|
22
|
-
# check for is ascii 7-bit ??? if yes -noworries
|
23
|
-
# if not, log number of chars not using ascii 7-bit
|
24
|
-
|
25
|
-
|
26
|
-
|
27
5
|
class Page
|
28
6
|
include LogUtils::Logging
|
29
7
|
|
@@ -35,52 +13,85 @@ class Page
|
|
35
13
|
## standard version (note: requires https)
|
36
14
|
SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
|
37
15
|
|
38
|
-
def initialize( code, opts={} )
|
39
|
-
### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
|
40
16
|
|
41
|
-
|
42
|
-
|
43
|
-
|
17
|
+
def self.parse( html ) ## parse html from string
|
18
|
+
new( html: html )
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.read( path )
|
22
|
+
html = File.open( path, 'r:utf-8' ) { |f| f.read }
|
23
|
+
new( html: html )
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.parse_json( json ) ## parse json from string
|
27
|
+
new( json: json )
|
28
|
+
end
|
29
|
+
|
30
|
+
def self.read_json( path )
|
31
|
+
json = File.open( path, 'r:utf-8' ) { |f| f.read }
|
32
|
+
new( json: json )
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.download( code, cache: false )
|
36
|
+
new( code, cache: cache )
|
37
|
+
end
|
38
|
+
|
39
|
+
## some convenience alias(es)
|
40
|
+
class << self
|
41
|
+
alias_method :read_html, :read
|
42
|
+
alias_method :parse_html, :parse
|
43
|
+
end
|
44
|
+
|
45
|
+
|
46
|
+
def initialize( code=nil,
|
47
|
+
json: nil,
|
48
|
+
html: nil,
|
49
|
+
cache: false,
|
50
|
+
info: nil )
|
51
|
+
if json
|
52
|
+
## note: assumes json is (still) a string/text
|
53
|
+
## (NOT yet parsed to structured data)
|
54
|
+
b = JsonBuilder.new( json )
|
44
55
|
else ## assume html
|
45
|
-
if
|
46
|
-
|
47
|
-
html = opts[:html]
|
56
|
+
if html
|
57
|
+
## for debugging and testing allow "custom" passed-in html page
|
48
58
|
else
|
49
|
-
|
50
|
-
|
59
|
+
## allow passing in code struct too - just use/pluck two-letter code from struct !!!
|
60
|
+
code = code.code if code.is_a?( Codes::Code )
|
51
61
|
|
52
|
-
|
53
|
-
|
62
|
+
raise ArgumentError, "two letter code (e.g. au) required to download page & build page url" if code.nil?
|
63
|
+
url = SITE_BASE.sub( '{code}', code )
|
64
|
+
|
65
|
+
html = if cache && Webcache.exist?( url )
|
66
|
+
Webcache.read( url ) ## for debugging - read from cache
|
67
|
+
else
|
68
|
+
download_page( url )
|
69
|
+
end
|
54
70
|
end
|
55
|
-
b = Builder.
|
71
|
+
b = Builder.new( html )
|
56
72
|
end
|
57
73
|
|
58
74
|
@sects = b.sects
|
59
75
|
@info = b.info
|
60
76
|
|
61
77
|
## todo/fix/quick hack:
|
62
|
-
## check for info opts
|
78
|
+
## check for info opts - lets you overwrite page info
|
63
79
|
## -- use proper header to setup page info - why, why not??
|
64
|
-
if
|
65
|
-
|
66
|
-
@info = info
|
67
|
-
end
|
80
|
+
@info = info if info
|
81
|
+
|
68
82
|
|
69
83
|
@data = {}
|
70
84
|
@sects.each do |sect|
|
71
85
|
@data[ sect.title ] = sect.data
|
72
86
|
end
|
73
|
-
|
74
|
-
self ## return self (check - not needed??)
|
75
87
|
end
|
76
88
|
|
77
89
|
|
78
|
-
def to_json(
|
79
|
-
if
|
90
|
+
def to_json( minify: false ) ## convenience helper for data.to_json; note: pretty print by default!
|
91
|
+
if minify
|
80
92
|
data.to_json
|
81
|
-
else
|
82
|
-
|
83
|
-
JSON.pretty_generate( data ) ## note: pretty print by default!
|
93
|
+
else ## note: pretty print by default!
|
94
|
+
JSON.pretty_generate( data )
|
84
95
|
end
|
85
96
|
end
|
86
97
|
|
@@ -96,30 +107,9 @@ class Page
|
|
96
107
|
data[key]
|
97
108
|
end
|
98
109
|
|
99
|
-
## add convenience (shortcut) accessors / attributes / fields / getters
|
100
|
-
|
101
|
-
ATTRIBUTES.each do |attrib|
|
102
|
-
## e.g.
|
103
|
-
## def background() data['Introduction']['Background']['text']; end
|
104
|
-
## def location() data['Geography']['Location']['text']; end
|
105
|
-
## etc.
|
106
|
-
if attrib.path.size == 1
|
107
|
-
define_method attrib.name.to_sym do
|
108
|
-
@data.fetch( attrib.category, {} ).
|
109
|
-
fetch( attrib.path[0], {} )['text']
|
110
|
-
end
|
111
|
-
else ## assume size 2 for now
|
112
|
-
define_method attrib.name.to_sym do
|
113
|
-
@data.fetch( attrib.category, {} ).
|
114
|
-
fetch( attrib.path[0], {} ).
|
115
|
-
fetch( attrib.path[1], {} )['text']
|
116
|
-
end
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
110
|
|
121
111
|
private
|
122
|
-
def
|
112
|
+
def download_page( url )
|
123
113
|
response = Webget.page( url )
|
124
114
|
|
125
115
|
## note: exit on get / fetch error - do NOT continue for now - why? why not?
|
@@ -128,21 +118,5 @@ private
|
|
128
118
|
|
129
119
|
response.text
|
130
120
|
end
|
131
|
-
|
132
|
-
|
133
|
-
=begin
|
134
|
-
def self.from_url( cc, cn )
|
135
|
-
html_ascii = PageFetcher.new.fetch( cc )
|
136
|
-
self.new( cc, cn, html_ascii )
|
137
|
-
end
|
138
|
-
|
139
|
-
def self.from_file( cc, cn, opts={} )
|
140
|
-
input_dir = opts[:input_dir] || '.'
|
141
|
-
html_ascii = File.read( "#{input_dir}/#{cc}.html" ) ## fix/todo: use ASCII8BIT/binary reader
|
142
|
-
self.new( cc, cn, html_ascii )
|
143
|
-
end
|
144
|
-
=end
|
145
|
-
|
146
|
-
|
147
121
|
end # class Page
|
148
122
|
end # module Factbook
|