factbook-readers 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +5 -5
  2. data/Manifest.txt +3 -25
  3. data/README.md +11 -69
  4. data/Rakefile +3 -3
  5. data/lib/factbook-readers.rb +5 -40
  6. data/lib/factbook-readers/convert.rb +37 -0
  7. data/lib/factbook-readers/counter.rb +7 -9
  8. data/lib/factbook-readers/page.rb +41 -61
  9. data/lib/factbook-readers/page_info.rb +15 -3
  10. data/lib/factbook-readers/version.rb +2 -2
  11. data/test/helper.rb +3 -0
  12. data/test/test_counter.rb +9 -6
  13. data/test/test_download.rb +27 -0
  14. data/test/test_fields.rb +44 -27
  15. data/test/test_json.rb +4 -4
  16. data/test/test_page.rb +8 -8
  17. data/test/test_version.rb +15 -0
  18. metadata +11 -48
  19. data/data/categories.csv +0 -164
  20. data/data/codes.csv +0 -262
  21. data/data/codesxref.csv +0 -280
  22. data/data/comparisons.csv +0 -75
  23. data/lib/factbook-readers/builder.rb +0 -187
  24. data/lib/factbook-readers/builder_item.rb +0 -201
  25. data/lib/factbook-readers/builder_json.rb +0 -68
  26. data/lib/factbook-readers/codes.rb +0 -121
  27. data/lib/factbook-readers/comparisons.rb +0 -49
  28. data/lib/factbook-readers/normalize.rb +0 -42
  29. data/lib/factbook-readers/reader_json.rb +0 -50
  30. data/lib/factbook-readers/sanitizer.rb +0 -351
  31. data/lib/factbook-readers/sect.rb +0 -28
  32. data/lib/factbook-readers/subsect.rb +0 -17
  33. data/lib/factbook-readers/table.rb +0 -51
  34. data/lib/factbook-readers/utils.rb +0 -47
  35. data/lib/factbook-readers/utils_info.rb +0 -128
  36. data/test/test_builder.rb +0 -30
  37. data/test/test_codes.rb +0 -72
  38. data/test/test_comparisons.rb +0 -16
  39. data/test/test_item_builder.rb +0 -97
  40. data/test/test_json_builder.rb +0 -23
  41. data/test/test_normalize.rb +0 -21
  42. data/test/test_sanitizer.rb +0 -36
  43. data/test/test_sanitizer_regex.rb +0 -87
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 2bc67eb2f60367d8d0ef00ca718c7d8b81b4a9c8
4
- data.tar.gz: f61389d6a073db31e79766c2711bbabb89b27699
2
+ SHA256:
3
+ metadata.gz: e5d0dc182771764b690661c69fdc905c4197b0a47e22f8a06903087719a3bb11
4
+ data.tar.gz: 7e995f86a4a1a9bb307914caff0ebb2ce39b0d2fafb62e8b1d4790608d7d6c5c
5
5
  SHA512:
6
- metadata.gz: 3a565e36afae190e18154bc366bbd3d1a77f06e0c51017ba34893448fd2588fa57e2c93647ef3de5338b423e63787ef248a56c512b67504a522122bb4b24e0ff
7
- data.tar.gz: 1cd6b487cb5fb2a2c5b659d2dacf0481ff5368f2e85f977c145bcf46e94f16d0543bbb59dd61995fb1b137c3bb4654308119c89b6ffa7883b0023684101b17dc
6
+ metadata.gz: b640ed837c55588df3f27381d86aeec14677d54d47236cf486ced33952a2619341fb396d79e280a2aa93c01600151a6ea3b0baa6a6d66c7c5ad6115479a72bbd
7
+ data.tar.gz: 147439c2077725912bf0ede44bb30376112abc1ec7810f9f17d46643cbc055e46ff0e2051e07098a284ff4c1990edd0051246cbd04eebd05b0603a72f1f1d128
data/Manifest.txt CHANGED
@@ -2,39 +2,17 @@ CHANGELOG.md
2
2
  Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
- data/categories.csv
6
- data/codes.csv
7
- data/codesxref.csv
8
- data/comparisons.csv
9
5
  lib/factbook-readers.rb
10
- lib/factbook-readers/builder.rb
11
- lib/factbook-readers/builder_item.rb
12
- lib/factbook-readers/builder_json.rb
13
- lib/factbook-readers/codes.rb
14
- lib/factbook-readers/comparisons.rb
6
+ lib/factbook-readers/convert.rb
15
7
  lib/factbook-readers/counter.rb
16
- lib/factbook-readers/normalize.rb
17
8
  lib/factbook-readers/page.rb
18
9
  lib/factbook-readers/page_info.rb
19
- lib/factbook-readers/reader_json.rb
20
- lib/factbook-readers/sanitizer.rb
21
- lib/factbook-readers/sect.rb
22
- lib/factbook-readers/subsect.rb
23
- lib/factbook-readers/table.rb
24
- lib/factbook-readers/utils.rb
25
- lib/factbook-readers/utils_info.rb
26
10
  lib/factbook-readers/version.rb
27
11
  lib/factbook/readers.rb
28
12
  test/helper.rb
29
- test/test_builder.rb
30
- test/test_codes.rb
31
- test/test_comparisons.rb
32
13
  test/test_counter.rb
14
+ test/test_download.rb
33
15
  test/test_fields.rb
34
- test/test_item_builder.rb
35
16
  test/test_json.rb
36
- test/test_json_builder.rb
37
- test/test_normalize.rb
38
17
  test/test_page.rb
39
- test/test_sanitizer.rb
40
- test/test_sanitizer_regex.rb
18
+ test/test_version.rb
data/README.md CHANGED
@@ -7,6 +7,11 @@
7
7
  * forum :: [groups.google.com/group/openmundi](https://groups.google.com/group/openmundi)
8
8
 
9
9
 
10
+ ## What's the World Factbook?
11
+
12
+ See [factbook/factbook.json »](https://github.com/factbook/factbook.json)
13
+
14
+
10
15
 
11
16
  ## Usage
12
17
 
@@ -14,7 +19,7 @@
14
19
 
15
20
  ```ruby
16
21
  page = Factbook::Page.new( 'br' ) # br is the country code for Brazil
17
- pp page.data # pretty print hash
22
+ pp page.to_h # pretty print data hash
18
23
  ```
19
24
 
20
25
  resulting in:
@@ -94,78 +99,15 @@ end
94
99
  ```
95
100
 
96
101
 
97
- ### List all codes
98
-
99
- ```ruby
100
- Factbook.codes.each do |code|
101
- pp code
102
- end
103
- ```
104
-
105
- resulting in:
106
-
107
- ```
108
- #<struct Factbook::Codes::Code
109
- code ="af",
110
- name ="Afghanistan",
111
- category="Countries",
112
- region ="South Asia">
113
- #<struct Factbook::Codes::Code
114
- code ="al",
115
- name ="Albania",
116
- category="Countries",
117
- region ="Europe">
118
- #<struct Factbook::Codes::Code
119
- code ="ag",
120
- name ="Algeria",
121
- category="Countries",
122
- region ="Africa">
123
- #<struct Factbook::Codes::Code
124
- code ="an",
125
- name ="Andorra",
126
- category="Countries",
127
- region ="Europe">
128
- ...
129
- ```
130
-
131
- Note: You can filter codes by category e.g. Countries, Dependencies, Miscellaneous, Oceans, etc.
132
- and/or by region e.g. Africa, Europe, South Asia, Central America and Caribbean, etc.
133
102
 
134
103
 
135
- ```ruby
136
-
137
- assert_equal 261, Factbook.codes.size
138
-
139
- ## categories
140
- assert_equal 195, Factbook.codes.countries.size
141
- assert_equal 52, Factbook.codes.dependencies.size
142
- assert_equal 5, Factbook.codes.oceans.size
143
- assert_equal 1, Factbook.codes.world.size
144
- assert_equal 2, Factbook.codes.others.size
145
- assert_equal 6, Factbook.codes.misc.size
146
-
147
- ## regions
148
- assert_equal 55, Factbook.codes.europe.size
149
- assert_equal 9, Factbook.codes.south_asia.size
150
- assert_equal 6, Factbook.codes.central_asia.size
151
- assert_equal 22, Factbook.codes.east_n_souteast_asia.size
152
- assert_equal 19, Factbook.codes.middle_east.size
153
- assert_equal 56, Factbook.codes.africa.size
154
- assert_equal 7, Factbook.codes.north_america.size
155
- assert_equal 33, Factbook.codes.central_america_n_caribbean.size
156
- assert_equal 14, Factbook.codes.south_america.size
157
- assert_equal 30, Factbook.codes.australia_oceania.size
158
- assert_equal 4, Factbook.codes.antartica.size
159
- assert_equal 5, Factbook.codes.region('Oceans').size
160
- assert_equal 1, Factbook.codes.region('World').size
161
-
162
- ## categories + regions
163
- assert_equal 45, Factbook.codes.countries.europe.size
164
- ...
165
- ```
166
104
 
167
- See [`data/codes.csv`](data/codes.csv) for the built-in listing of all codes with categories and regions.
105
+ ## Ready-To-Use Public Domain (Free) Factbook Datasets
168
106
 
107
+ [factbook/factbook.json](https://github.com/factbook/factbook.json) - open (public domain)
108
+ factbook country profiles in JSON for all the world's countries (note: using the original
109
+ / official two-letter GEC (formerly FIPS) codes and NOT the ISO country codes you might be used to, e.g. Austria is `au.json` and NOT `at.json`,
110
+ Germany is `gm.json` and NOT `de.json`, and so on)
169
111
 
170
112
 
171
113
 
data/Rakefile CHANGED
@@ -18,10 +18,10 @@ Hoe.spec 'factbook-readers' do
18
18
  self.history_file = 'CHANGELOG.md'
19
19
 
20
20
  self.extra_deps = [
21
- ['logutils' ],
22
- ['csvreader'],
21
+ ['factbook-codes' ],
22
+ ['factbook-fields' ],
23
23
  ['webget'],
24
- ['nokogiri'],
24
+ ## ['nokogiri'],
25
25
  ## ['activerecord'] # note: will include activesupport,etc.
26
26
  ]
27
27
 
@@ -1,13 +1,8 @@
1
- ## 3rd party gems/libs
2
- ## require 'props'
1
+ require 'factbook-fields'
3
2
 
4
- require 'logutils'
3
+ ## more 3rd party gems/libs
4
+ ## require 'props'
5
5
  require 'webget'
6
- require 'csvreader'
7
-
8
-
9
- require 'nokogiri'
10
-
11
6
 
12
7
 
13
8
 
@@ -15,40 +10,10 @@ require 'nokogiri'
15
10
  require 'factbook-readers/version' # let it always go first
16
11
 
17
12
 
18
- require 'factbook-readers/codes'
19
- require 'factbook-readers/comparisons'
20
-
21
-
22
- ## note: make codes, comparisons available
23
- module Factbook
24
- ## note: load on demand only builtin codes, comparisons, etc.
25
- ## for now
26
- def self.codes
27
- @@codes ||= Codes.read_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" );
28
- end
29
- def self.comparisons
30
- @@comparisons ||= Comparisons.read_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
31
- end
32
- end # module Factbook
33
13
 
34
-
35
-
36
- require 'factbook-readers/utils'
37
- require 'factbook-readers/utils_info'
38
- require 'factbook-readers/sanitizer'
39
- require 'factbook-readers/normalize'
40
- require 'factbook-readers/builder_item'
41
- require 'factbook-readers/builder'
42
- require 'factbook-readers/builder_json'
43
- require 'factbook-readers/page'
14
+ require 'factbook-readers/convert'
44
15
  require 'factbook-readers/page_info'
45
- require 'factbook-readers/sect'
46
- require 'factbook-readers/subsect'
47
-
48
-
49
- require 'factbook-readers/reader_json'
50
-
51
- require 'factbook-readers/table' ## e.g. TableReader
16
+ require 'factbook-readers/page'
52
17
 
53
18
  require 'factbook-readers/counter'
54
19
 
@@ -0,0 +1,37 @@
1
+
2
+
3
+ def convert_cia( cia )
4
+ ## convert from "raw" on-the-wire cia format to
5
+ ## "standard" compact "classic" format
6
+
7
+ data = {}
8
+
9
+ cia['categories'].each do |cia_cat|
10
+ cat = data[ cia_cat['title'] ] = {}
11
+ cia_cat['fields'].each do |cia_field|
12
+ field = cat[ cia_field['name'] ] = {}
13
+ if cia_field['subfields']
14
+ cia_field['subfields'].each do |cia_subfield|
15
+ subfield = field[ cia_subfield['name'] ] = {}
16
+ subfield[ 'text' ] = cia_subfield['content']
17
+ end
18
+
19
+ puts "== #{cia_cat['title']} / #{cia_field['name']} - skipping field content (w/ subfields):"
20
+ puts " >#{cia_field['content']}<"
21
+ puts " ?? same as:"
22
+ cia_field['subfields'].each do |cia_subfield|
23
+ puts " #{cia_subfield['name']}: >#{cia_subfield['content']}<"
24
+ end
25
+
26
+ else
27
+ field[ 'text' ] = cia_field['content']
28
+ end
29
+
30
+ if cia_field[ 'field_note' ]
31
+ field[ 'note' ] = cia_field[ 'field_note' ]
32
+ end
33
+ end
34
+ end
35
+
36
+ data
37
+ end
@@ -9,20 +9,20 @@ def initialize
9
9
  @data = {}
10
10
  end
11
11
 
12
- def count( page )
12
+ def count( code, page )
13
13
 
14
14
  ## walk page data hash
15
15
  # add nodes to data
16
16
 
17
- walk( page, page.data, @data )
17
+ walk( code, page.to_h, @data )
18
18
  end
19
19
 
20
20
 
21
21
  private
22
- def walk( page, hin, hout )
22
+ def walk( code, hin, hout )
23
23
  hin.each do |k,v|
24
- if v.is_a? Hash
25
- hout2 = hout[k] || { count: 0, codes: '' }
24
+ if v.is_a?( Hash )
25
+ hout2 = hout[k] ||= { count: 0, codes: '' }
26
26
 
27
27
  hout2[ :count ] += 1
28
28
 
@@ -32,12 +32,10 @@ def walk( page, hin, hout )
32
32
  codes = hout2[ :codes ]
33
33
  if codes ## note: might have been deleted if past the threshold (e.g. 9 entries)
34
34
  codes << ' ' unless codes.empty? ## add separator (space for now)
35
- codes << page.info.country_code
35
+ codes << code
36
36
  hout2[ :codes ] = codes
37
37
  end
38
-
39
- hout[k] = hout2
40
- walk( page, v, hout2 )
38
+ walk( code, v, hout2 )
41
39
  end
42
40
  end
43
41
  end
@@ -5,23 +5,12 @@ module Factbook
5
5
  class Page
6
6
  include LogUtils::Logging
7
7
 
8
- attr_reader :sects ## "structured" access e.g. sects/subsects/etc.
9
- attr_reader :info ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
10
- attr_reader :data ## "plain" access with vanilla hash
11
-
8
+ attr_reader :info ## meta info e.g. country_code, country_name, region_name, updated, etc.
12
9
 
13
10
  ## standard version (note: requires https)
14
- SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
15
-
11
+ SITE_BASE = "https://www.cia.gov/the-world-factbook/geos/{code}.json"
16
12
 
17
- def self.parse( html ) ## parse html from string
18
- new( html: html )
19
- end
20
13
 
21
- def self.read( path )
22
- html = File.open( path, 'r:utf-8' ) { |f| f.read }
23
- new( html: html )
24
- end
25
14
 
26
15
  def self.parse_json( json ) ## parse json from string
27
16
  new( json: json )
@@ -36,87 +25,78 @@ class Page
36
25
  new( code, cache: cache )
37
26
  end
38
27
 
39
- ## some convenience alias(es)
40
- class << self
41
- alias_method :read_html, :read
42
- alias_method :parse_html, :parse
43
- end
44
28
 
45
29
 
46
30
  def initialize( code=nil,
47
31
  json: nil,
48
- html: nil,
49
32
  cache: false,
50
33
  info: nil )
51
34
  if json
52
35
  ## note: assumes json is (still) a string/text
53
36
  ## (NOT yet parsed to structured data)
54
- b = JsonBuilder.new( json )
55
- else ## assume html
56
- if html
57
- ## for debugging and testing allow "custom" passed-in html page
58
- else
37
+ b = ProfileBuilder.new( json )
38
+ else ## assume "raw" json dataset
59
39
  ## allow passing in code struct too - just use/pluck two-letter code from struct !!!
60
40
  code = code.code if code.is_a?( Codes::Code )
61
41
 
62
42
  raise ArgumentError, "two letter code (e.g. au) required to download page & build page url" if code.nil?
63
43
  url = SITE_BASE.sub( '{code}', code )
64
44
 
65
- html = if cache && Webcache.exist?( url )
66
- Webcache.read( url ) ## for debugging - read from cache
67
- else
68
- download_page( url )
69
- end
70
- end
71
- b = Builder.new( html )
45
+ raw_data = if cache && Webcache.exist?( url )
46
+ text = Webcache.read( url ) ## for debugging - read from cache
47
+ JSON.parse( text )
48
+ else
49
+ download_data( url )
50
+ end
51
+
52
+ ## meta info from raw data - example:
53
+ ## "name": "Aruba",
54
+ ## "code": "AA",
55
+ ## "region": "Central America",
56
+ ## "published": "2021-01-25 09:07:08 -0500",
57
+ ## "updated": "2021-01-22 14:38:14 -0500",
58
+ ##
59
+ ## note: published is NOT before updated (like an alias for created) BUT is often newer/later than updated - why!?
60
+
61
+ @info = PageInfo.new
62
+
63
+ @info.country_code = raw_data['code'].downcase
64
+ @info.country_name = raw_data['name']
65
+ @info.region_name = raw_data['region']
66
+
67
+ ## note: just parse year,month,day for now (skip hours,minutes,etc.)
68
+ @info.published = Date.strptime( raw_data['published'], '%Y-%m-%d' )
69
+ @info.updated = Date.strptime( raw_data['updated'], '%Y-%m-%d' )
70
+
71
+ data = convert_cia( raw_data )
72
+ b = ProfileBuilder.new( data )
72
73
  end
73
74
 
74
- @sects = b.sects
75
- @info = b.info
75
+ @profile = b.profile
76
76
 
77
77
  ## todo/fix/quick hack:
78
78
  ## check for info opts - lets you overwrite page info
79
79
  ## -- use proper header to setup page info - why, why not??
80
80
  @info = info if info
81
-
82
-
83
- @data = {}
84
- @sects.each do |sect|
85
- @data[ sect.title ] = sect.data
86
- end
87
81
  end
88
82
 
89
83
 
90
- def to_json( minify: false ) ## convenience helper for data.to_json; note: pretty print by default!
91
- if minify
92
- data.to_json
93
- else ## note: pretty print by default!
94
- JSON.pretty_generate( data )
95
- end
96
- end
97
84
 
98
-
99
- def [](key) ### convenience shortcut
100
- # lets you use
101
- # page['geo']
102
- # instead of
103
- # page.data['geo']
104
-
105
- ## fix: use delegate data, [] from forwardable lib - why?? why not??
106
-
107
- data[key]
108
- end
85
+ ## convenience helpers - forward to profile
86
+ def [](key) @profile[key]; end
87
+ def to_h() @profile.to_h; end
88
+ def to_json( minify: false ) @profile.to_json( minify: minify ); end
89
+ def size() @profile.size; end
109
90
 
110
91
 
111
92
  private
112
- def download_page( url )
113
- response = Webget.page( url )
93
+ def download_data( url )
94
+ response = Webget.call( url )
114
95
 
115
96
  ## note: exit on get / fetch error - do NOT continue for now - why? why not?
116
97
  exit 1 if response.status.nok? ## e.g. HTTP status code != 200
117
98
 
118
-
119
- response.text
99
+ response.json
120
100
  end
121
101
  end # class Page
122
102
  end # module Factbook