factbook-readers 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/Manifest.txt +3 -25
- data/README.md +11 -69
- data/Rakefile +3 -3
- data/lib/factbook-readers.rb +5 -40
- data/lib/factbook-readers/convert.rb +37 -0
- data/lib/factbook-readers/counter.rb +7 -9
- data/lib/factbook-readers/page.rb +41 -61
- data/lib/factbook-readers/page_info.rb +15 -3
- data/lib/factbook-readers/version.rb +2 -2
- data/test/helper.rb +3 -0
- data/test/test_counter.rb +9 -6
- data/test/test_download.rb +27 -0
- data/test/test_fields.rb +44 -27
- data/test/test_json.rb +4 -4
- data/test/test_page.rb +8 -8
- data/test/test_version.rb +15 -0
- metadata +11 -48
- data/data/categories.csv +0 -164
- data/data/codes.csv +0 -262
- data/data/codesxref.csv +0 -280
- data/data/comparisons.csv +0 -75
- data/lib/factbook-readers/builder.rb +0 -187
- data/lib/factbook-readers/builder_item.rb +0 -201
- data/lib/factbook-readers/builder_json.rb +0 -68
- data/lib/factbook-readers/codes.rb +0 -121
- data/lib/factbook-readers/comparisons.rb +0 -49
- data/lib/factbook-readers/normalize.rb +0 -42
- data/lib/factbook-readers/reader_json.rb +0 -50
- data/lib/factbook-readers/sanitizer.rb +0 -351
- data/lib/factbook-readers/sect.rb +0 -28
- data/lib/factbook-readers/subsect.rb +0 -17
- data/lib/factbook-readers/table.rb +0 -51
- data/lib/factbook-readers/utils.rb +0 -47
- data/lib/factbook-readers/utils_info.rb +0 -128
- data/test/test_builder.rb +0 -30
- data/test/test_codes.rb +0 -72
- data/test/test_comparisons.rb +0 -16
- data/test/test_item_builder.rb +0 -97
- data/test/test_json_builder.rb +0 -23
- data/test/test_normalize.rb +0 -21
- data/test/test_sanitizer.rb +0 -36
- data/test/test_sanitizer_regex.rb +0 -87
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e5d0dc182771764b690661c69fdc905c4197b0a47e22f8a06903087719a3bb11
|
4
|
+
data.tar.gz: 7e995f86a4a1a9bb307914caff0ebb2ce39b0d2fafb62e8b1d4790608d7d6c5c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b640ed837c55588df3f27381d86aeec14677d54d47236cf486ced33952a2619341fb396d79e280a2aa93c01600151a6ea3b0baa6a6d66c7c5ad6115479a72bbd
|
7
|
+
data.tar.gz: 147439c2077725912bf0ede44bb30376112abc1ec7810f9f17d46643cbc055e46ff0e2051e07098a284ff4c1990edd0051246cbd04eebd05b0603a72f1f1d128
|
data/Manifest.txt
CHANGED
@@ -2,39 +2,17 @@ CHANGELOG.md
|
|
2
2
|
Manifest.txt
|
3
3
|
README.md
|
4
4
|
Rakefile
|
5
|
-
data/categories.csv
|
6
|
-
data/codes.csv
|
7
|
-
data/codesxref.csv
|
8
|
-
data/comparisons.csv
|
9
5
|
lib/factbook-readers.rb
|
10
|
-
lib/factbook-readers/
|
11
|
-
lib/factbook-readers/builder_item.rb
|
12
|
-
lib/factbook-readers/builder_json.rb
|
13
|
-
lib/factbook-readers/codes.rb
|
14
|
-
lib/factbook-readers/comparisons.rb
|
6
|
+
lib/factbook-readers/convert.rb
|
15
7
|
lib/factbook-readers/counter.rb
|
16
|
-
lib/factbook-readers/normalize.rb
|
17
8
|
lib/factbook-readers/page.rb
|
18
9
|
lib/factbook-readers/page_info.rb
|
19
|
-
lib/factbook-readers/reader_json.rb
|
20
|
-
lib/factbook-readers/sanitizer.rb
|
21
|
-
lib/factbook-readers/sect.rb
|
22
|
-
lib/factbook-readers/subsect.rb
|
23
|
-
lib/factbook-readers/table.rb
|
24
|
-
lib/factbook-readers/utils.rb
|
25
|
-
lib/factbook-readers/utils_info.rb
|
26
10
|
lib/factbook-readers/version.rb
|
27
11
|
lib/factbook/readers.rb
|
28
12
|
test/helper.rb
|
29
|
-
test/test_builder.rb
|
30
|
-
test/test_codes.rb
|
31
|
-
test/test_comparisons.rb
|
32
13
|
test/test_counter.rb
|
14
|
+
test/test_download.rb
|
33
15
|
test/test_fields.rb
|
34
|
-
test/test_item_builder.rb
|
35
16
|
test/test_json.rb
|
36
|
-
test/test_json_builder.rb
|
37
|
-
test/test_normalize.rb
|
38
17
|
test/test_page.rb
|
39
|
-
test/
|
40
|
-
test/test_sanitizer_regex.rb
|
18
|
+
test/test_version.rb
|
data/README.md
CHANGED
@@ -7,6 +7,11 @@
|
|
7
7
|
* forum :: [groups.google.com/group/openmundi](https://groups.google.com/group/openmundi)
|
8
8
|
|
9
9
|
|
10
|
+
## What's the World Factbook?
|
11
|
+
|
12
|
+
See [factbook/factbook.json »](https://github.com/factbook/factbook.json)
|
13
|
+
|
14
|
+
|
10
15
|
|
11
16
|
## Usage
|
12
17
|
|
@@ -14,7 +19,7 @@
|
|
14
19
|
|
15
20
|
```ruby
|
16
21
|
page = Factbook::Page.new( 'br' ) # br is the country code for Brazil
|
17
|
-
pp page.
|
22
|
+
pp page.to_h # pretty print data hash
|
18
23
|
```
|
19
24
|
|
20
25
|
resulting in:
|
@@ -94,78 +99,15 @@ end
|
|
94
99
|
```
|
95
100
|
|
96
101
|
|
97
|
-
### List all codes
|
98
|
-
|
99
|
-
```ruby
|
100
|
-
Factbook.codes.each do |code|
|
101
|
-
pp code
|
102
|
-
end
|
103
|
-
```
|
104
|
-
|
105
|
-
resulting in:
|
106
|
-
|
107
|
-
```
|
108
|
-
#<struct Factbook::Codes::Code
|
109
|
-
code ="af",
|
110
|
-
name ="Afghanistan",
|
111
|
-
category="Countries",
|
112
|
-
region ="South Asia">
|
113
|
-
#<struct Factbook::Codes::Code
|
114
|
-
code ="al",
|
115
|
-
name ="Albania",
|
116
|
-
category="Countries",
|
117
|
-
region ="Europe">
|
118
|
-
#<struct Factbook::Codes::Code
|
119
|
-
code ="ag",
|
120
|
-
name ="Algeria",
|
121
|
-
category="Countries",
|
122
|
-
region ="Africa">
|
123
|
-
#<struct Factbook::Codes::Code
|
124
|
-
code ="an",
|
125
|
-
name ="Andorra",
|
126
|
-
category="Countries",
|
127
|
-
region ="Europe">
|
128
|
-
...
|
129
|
-
```
|
130
|
-
|
131
|
-
Note: You can filter codes by category e.g. Countries, Dependencies, Miscellaneous, Oceans, etc.
|
132
|
-
and/or by region e.g. Africa, Europe, South Asia, Central America and Caribbean, etc.
|
133
102
|
|
134
103
|
|
135
|
-
```ruby
|
136
|
-
|
137
|
-
assert_equal 261, Factbook.codes.size
|
138
|
-
|
139
|
-
## categories
|
140
|
-
assert_equal 195, Factbook.codes.countries.size
|
141
|
-
assert_equal 52, Factbook.codes.dependencies.size
|
142
|
-
assert_equal 5, Factbook.codes.oceans.size
|
143
|
-
assert_equal 1, Factbook.codes.world.size
|
144
|
-
assert_equal 2, Factbook.codes.others.size
|
145
|
-
assert_equal 6, Factbook.codes.misc.size
|
146
|
-
|
147
|
-
## regions
|
148
|
-
assert_equal 55, Factbook.codes.europe.size
|
149
|
-
assert_equal 9, Factbook.codes.south_asia.size
|
150
|
-
assert_equal 6, Factbook.codes.central_asia.size
|
151
|
-
assert_equal 22, Factbook.codes.east_n_souteast_asia.size
|
152
|
-
assert_equal 19, Factbook.codes.middle_east.size
|
153
|
-
assert_equal 56, Factbook.codes.africa.size
|
154
|
-
assert_equal 7, Factbook.codes.north_america.size
|
155
|
-
assert_equal 33, Factbook.codes.central_america_n_caribbean.size
|
156
|
-
assert_equal 14, Factbook.codes.south_america.size
|
157
|
-
assert_equal 30, Factbook.codes.australia_oceania.size
|
158
|
-
assert_equal 4, Factbook.codes.antartica.size
|
159
|
-
assert_equal 5, Factbook.codes.region('Oceans').size
|
160
|
-
assert_equal 1, Factbook.codes.region('World').size
|
161
|
-
|
162
|
-
## categories + regions
|
163
|
-
assert_equal 45, Factbook.codes.countries.europe.size
|
164
|
-
...
|
165
|
-
```
|
166
104
|
|
167
|
-
|
105
|
+
## Ready-To-Use Public Domain (Free) Factbook Datasets
|
168
106
|
|
107
|
+
[factbook/factbook.json](https://github.com/factbook/factbook.json) - open (public domain)
|
108
|
+
factbook country profiles in JSON for all the world's countries (note: using the original
|
109
|
+
/ official two-letter GEC (formerly FIPS) codes and NOT the ISO codes you might be used to for country codes, e.g. Austria is `au.json` and NOT `at.json`,
|
110
|
+
Germany is `gm.json` and NOT `de.json`, and so on)
|
169
111
|
|
170
112
|
|
171
113
|
|
data/Rakefile
CHANGED
@@ -18,10 +18,10 @@ Hoe.spec 'factbook-readers' do
|
|
18
18
|
self.history_file = 'CHANGELOG.md'
|
19
19
|
|
20
20
|
self.extra_deps = [
|
21
|
-
['
|
22
|
-
['
|
21
|
+
['factbook-codes' ],
|
22
|
+
['factbook-fields' ],
|
23
23
|
['webget'],
|
24
|
-
['nokogiri'],
|
24
|
+
## ['nokogiri'],
|
25
25
|
## ['activerecord'] # note: will include activesupport,etc.
|
26
26
|
]
|
27
27
|
|
data/lib/factbook-readers.rb
CHANGED
@@ -1,13 +1,8 @@
|
|
1
|
-
|
2
|
-
## require 'props'
|
1
|
+
require 'factbook-fields'
|
3
2
|
|
4
|
-
|
3
|
+
## more 3rd party gems/libs
|
4
|
+
## require 'props'
|
5
5
|
require 'webget'
|
6
|
-
require 'csvreader'
|
7
|
-
|
8
|
-
|
9
|
-
require 'nokogiri'
|
10
|
-
|
11
6
|
|
12
7
|
|
13
8
|
|
@@ -15,40 +10,10 @@ require 'nokogiri'
|
|
15
10
|
require 'factbook-readers/version' # let it always go first
|
16
11
|
|
17
12
|
|
18
|
-
require 'factbook-readers/codes'
|
19
|
-
require 'factbook-readers/comparisons'
|
20
|
-
|
21
|
-
|
22
|
-
## note: make codes, comparisons available
|
23
|
-
module Factbook
|
24
|
-
## note: load on demand only builtin codes, comparisons, etc.
|
25
|
-
## for now
|
26
|
-
def self.codes
|
27
|
-
@@codes ||= Codes.read_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" );
|
28
|
-
end
|
29
|
-
def self.comparisons
|
30
|
-
@@comparisons ||= Comparisons.read_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
|
31
|
-
end
|
32
|
-
end # module Factbook
|
33
13
|
|
34
|
-
|
35
|
-
|
36
|
-
require 'factbook-readers/utils'
|
37
|
-
require 'factbook-readers/utils_info'
|
38
|
-
require 'factbook-readers/sanitizer'
|
39
|
-
require 'factbook-readers/normalize'
|
40
|
-
require 'factbook-readers/builder_item'
|
41
|
-
require 'factbook-readers/builder'
|
42
|
-
require 'factbook-readers/builder_json'
|
43
|
-
require 'factbook-readers/page'
|
14
|
+
require 'factbook-readers/convert'
|
44
15
|
require 'factbook-readers/page_info'
|
45
|
-
require 'factbook-readers/
|
46
|
-
require 'factbook-readers/subsect'
|
47
|
-
|
48
|
-
|
49
|
-
require 'factbook-readers/reader_json'
|
50
|
-
|
51
|
-
require 'factbook-readers/table' ## e.g. TableReader
|
16
|
+
require 'factbook-readers/page'
|
52
17
|
|
53
18
|
require 'factbook-readers/counter'
|
54
19
|
|
@@ -0,0 +1,37 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
## Convert a profile from the "raw" on-the-wire cia format to the
##   "standard" compact "classic" format.
##
##  cia - hash-like object indexed with string keys; reads
##          cia['categories'] and for every category 'title' and 'fields',
##          for every field 'name', 'content', 'subfields' and 'field_note',
##          for every subfield 'name' and 'content'
##
##  returns a new nested hash:
##    category title => field name => { 'text' => content }
##  or for fields with subfields:
##    category title => field name => subfield name => { 'text' => content }
##  plus a 'note' entry on the field if a 'field_note' is present.
##
##  note: missing (nil) or empty 'categories', 'fields' or 'subfields'
##          are treated as "nothing to walk" - a field with an empty
##          subfields list keeps its own content as 'text'
##          (instead of getting dropped).
def convert_cia( cia )
  data = {}

  Array( cia['categories'] ).each do |cia_cat|
    cat = data[ cia_cat['title'] ] = {}

    Array( cia_cat['fields'] ).each do |cia_field|
      field     = cat[ cia_field['name'] ] = {}
      subfields = Array( cia_field['subfields'] )

      if subfields.empty?
        ## plain field (no subfields) - keep the field content as is
        field[ 'text' ] = cia_field['content']
      else
        subfields.each do |cia_subfield|
          field[ cia_subfield['name'] ] = { 'text' => cia_subfield['content'] }
        end

        ## the field content gets skipped in favor of the subfields;
        ##   print both (to stdout) to allow an eyeball comparison
        puts "== #{cia_cat['title']} / #{cia_field['name']} - skipping field content (w/ subfields):"
        puts " >#{cia_field['content']}<"
        puts " ?? same as:"
        subfields.each do |cia_subfield|
          puts " #{cia_subfield['name']}: >#{cia_subfield['content']}<"
        end
      end

      field[ 'note' ] = cia_field[ 'field_note' ]   if cia_field[ 'field_note' ]
    end
  end

  data
end
|
@@ -9,20 +9,20 @@ def initialize
|
|
9
9
|
@data = {}
|
10
10
|
end
|
11
11
|
|
12
|
-
def count( page )
|
12
|
+
def count( code, page )
|
13
13
|
|
14
14
|
## walk page data hash
|
15
15
|
# add nodes to data
|
16
16
|
|
17
|
-
walk(
|
17
|
+
walk( code, page.to_h, @data )
|
18
18
|
end
|
19
19
|
|
20
20
|
|
21
21
|
private
|
22
|
-
def walk(
|
22
|
+
def walk( code, hin, hout )
|
23
23
|
hin.each do |k,v|
|
24
|
-
if v.is_a? Hash
|
25
|
-
hout2 = hout[k]
|
24
|
+
if v.is_a?( Hash )
|
25
|
+
hout2 = hout[k] ||= { count: 0, codes: '' }
|
26
26
|
|
27
27
|
hout2[ :count ] += 1
|
28
28
|
|
@@ -32,12 +32,10 @@ def walk( page, hin, hout )
|
|
32
32
|
codes = hout2[ :codes ]
|
33
33
|
if codes ## note: might got deleted if passed treshhold (e.g. 9 entries)
|
34
34
|
codes << ' ' unless codes.empty? ## add separator (space for now)
|
35
|
-
codes <<
|
35
|
+
codes << code
|
36
36
|
hout2[ :codes ] = codes
|
37
37
|
end
|
38
|
-
|
39
|
-
hout[k] = hout2
|
40
|
-
walk( page, v, hout2 )
|
38
|
+
walk( code, v, hout2 )
|
41
39
|
end
|
42
40
|
end
|
43
41
|
end
|
@@ -5,23 +5,12 @@ module Factbook
|
|
5
5
|
class Page
|
6
6
|
include LogUtils::Logging
|
7
7
|
|
8
|
-
attr_reader :
|
9
|
-
attr_reader :info ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
|
10
|
-
attr_reader :data ## "plain" access with vanilla hash
|
11
|
-
|
8
|
+
attr_reader :info ## meta info e.g. country_code, country_name, region_name, updated, etc.
|
12
9
|
|
13
10
|
## standard version (note: requires https)
|
14
|
-
SITE_BASE =
|
15
|
-
|
11
|
+
SITE_BASE = "https://www.cia.gov/the-world-factbook/geos/{code}.json"
|
16
12
|
|
17
|
-
def self.parse( html ) ## parse html from string
|
18
|
-
new( html: html )
|
19
|
-
end
|
20
13
|
|
21
|
-
def self.read( path )
|
22
|
-
html = File.open( path, 'r:utf-8' ) { |f| f.read }
|
23
|
-
new( html: html )
|
24
|
-
end
|
25
14
|
|
26
15
|
def self.parse_json( json ) ## parse json from string
|
27
16
|
new( json: json )
|
@@ -36,87 +25,78 @@ class Page
|
|
36
25
|
new( code, cache: cache )
|
37
26
|
end
|
38
27
|
|
39
|
-
## some convenience alias(es)
|
40
|
-
class << self
|
41
|
-
alias_method :read_html, :read
|
42
|
-
alias_method :parse_html, :parse
|
43
|
-
end
|
44
28
|
|
45
29
|
|
46
30
|
def initialize( code=nil,
|
47
31
|
json: nil,
|
48
|
-
html: nil,
|
49
32
|
cache: false,
|
50
33
|
info: nil )
|
51
34
|
if json
|
52
35
|
## note: assumes json is (still) a string/text
|
53
36
|
## (NOT yet parsed to structured data)
|
54
|
-
b =
|
55
|
-
else ## assume
|
56
|
-
if html
|
57
|
-
## for debugging and testing allow "custom" passed-in html page
|
58
|
-
else
|
37
|
+
b = ProfileBuilder.new( json )
|
38
|
+
else ## assume "raw" json dataset
|
59
39
|
## allow passing in code struct too - just use/pluck two-letter code from struct !!!
|
60
40
|
code = code.code if code.is_a?( Codes::Code )
|
61
41
|
|
62
42
|
raise ArgumentError, "two letter code (e.g. au) required to download page & build page url" if code.nil?
|
63
43
|
url = SITE_BASE.sub( '{code}', code )
|
64
44
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
45
|
+
raw_data = if cache && Webcache.exist?( url )
|
46
|
+
text = Webcache.read( url ) ## for debugging - read from cache
|
47
|
+
JSON.parse( text )
|
48
|
+
else
|
49
|
+
download_data( url )
|
50
|
+
end
|
51
|
+
|
52
|
+
## meta info from raw data - example:
|
53
|
+
## "name": "Aruba",
|
54
|
+
## "code": "AA",
|
55
|
+
## "region": "Central America",
|
56
|
+
## "published": "2021-01-25 09:07:08 -0500",
|
57
|
+
## "updated": "2021-01-22 14:38:14 -0500",
|
58
|
+
##
|
59
|
+
## note: published is NOT before updated (like an alias for created) BUT is often newer/later than updated - why!?
|
60
|
+
|
61
|
+
@info = PageInfo.new
|
62
|
+
|
63
|
+
@info.country_code = raw_data['code'].downcase
|
64
|
+
@info.country_name = raw_data['name']
|
65
|
+
@info.region_name = raw_data['region']
|
66
|
+
|
67
|
+
## note: just parse year,month,day for now (skip hours,minutes,etc.)
|
68
|
+
@info.published = Date.strptime( raw_data['published'], '%Y-%m-%d' )
|
69
|
+
@info.updated = Date.strptime( raw_data['updated'], '%Y-%m-%d' )
|
70
|
+
|
71
|
+
data = convert_cia( raw_data )
|
72
|
+
b = ProfileBuilder.new( data )
|
72
73
|
end
|
73
74
|
|
74
|
-
@
|
75
|
-
@info = b.info
|
75
|
+
@profile = b.profile
|
76
76
|
|
77
77
|
## todo/fix/quick hack:
|
78
78
|
## check for info opts - lets you overwrite page info
|
79
79
|
## -- use proper header to setup page info - why, why not??
|
80
80
|
@info = info if info
|
81
|
-
|
82
|
-
|
83
|
-
@data = {}
|
84
|
-
@sects.each do |sect|
|
85
|
-
@data[ sect.title ] = sect.data
|
86
|
-
end
|
87
81
|
end
|
88
82
|
|
89
83
|
|
90
|
-
def to_json( minify: false ) ## convenience helper for data.to_json; note: pretty print by default!
|
91
|
-
if minify
|
92
|
-
data.to_json
|
93
|
-
else ## note: pretty print by default!
|
94
|
-
JSON.pretty_generate( data )
|
95
|
-
end
|
96
|
-
end
|
97
84
|
|
98
|
-
|
99
|
-
def [](key)
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
# page.data['geo']
|
104
|
-
|
105
|
-
## fix: use delegate data, [] from forwardable lib - why?? why not??
|
106
|
-
|
107
|
-
data[key]
|
108
|
-
end
|
85
|
+
## convenience helpers - forward to profile
|
86
|
+
def [](key) @profile[key]; end
|
87
|
+
def to_h() @profile.to_h; end
|
88
|
+
def to_json( minify: false ) @profile.to_json( minify: minify ); end
|
89
|
+
def size() @profile.size; end
|
109
90
|
|
110
91
|
|
111
92
|
private
|
112
|
-
def
|
113
|
-
response = Webget.
|
93
|
+
def download_data( url )
|
94
|
+
response = Webget.call( url )
|
114
95
|
|
115
96
|
## note: exit on get / fetch error - do NOT continue for now - why? why not?
|
116
97
|
exit 1 if response.status.nok? ## e.g. HTTP status code != 200
|
117
98
|
|
118
|
-
|
119
|
-
response.text
|
99
|
+
response.json
|
120
100
|
end
|
121
101
|
end # class Page
|
122
102
|
end # module Factbook
|