factbook-readers 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +5 -5
  2. data/Manifest.txt +3 -25
  3. data/README.md +11 -69
  4. data/Rakefile +3 -3
  5. data/lib/factbook-readers.rb +5 -40
  6. data/lib/factbook-readers/convert.rb +37 -0
  7. data/lib/factbook-readers/counter.rb +7 -9
  8. data/lib/factbook-readers/page.rb +41 -61
  9. data/lib/factbook-readers/page_info.rb +15 -3
  10. data/lib/factbook-readers/version.rb +2 -2
  11. data/test/helper.rb +3 -0
  12. data/test/test_counter.rb +9 -6
  13. data/test/test_download.rb +27 -0
  14. data/test/test_fields.rb +44 -27
  15. data/test/test_json.rb +4 -4
  16. data/test/test_page.rb +8 -8
  17. data/test/test_version.rb +15 -0
  18. metadata +11 -48
  19. data/data/categories.csv +0 -164
  20. data/data/codes.csv +0 -262
  21. data/data/codesxref.csv +0 -280
  22. data/data/comparisons.csv +0 -75
  23. data/lib/factbook-readers/builder.rb +0 -187
  24. data/lib/factbook-readers/builder_item.rb +0 -201
  25. data/lib/factbook-readers/builder_json.rb +0 -68
  26. data/lib/factbook-readers/codes.rb +0 -121
  27. data/lib/factbook-readers/comparisons.rb +0 -49
  28. data/lib/factbook-readers/normalize.rb +0 -42
  29. data/lib/factbook-readers/reader_json.rb +0 -50
  30. data/lib/factbook-readers/sanitizer.rb +0 -351
  31. data/lib/factbook-readers/sect.rb +0 -28
  32. data/lib/factbook-readers/subsect.rb +0 -17
  33. data/lib/factbook-readers/table.rb +0 -51
  34. data/lib/factbook-readers/utils.rb +0 -47
  35. data/lib/factbook-readers/utils_info.rb +0 -128
  36. data/test/test_builder.rb +0 -30
  37. data/test/test_codes.rb +0 -72
  38. data/test/test_comparisons.rb +0 -16
  39. data/test/test_item_builder.rb +0 -97
  40. data/test/test_json_builder.rb +0 -23
  41. data/test/test_normalize.rb +0 -21
  42. data/test/test_sanitizer.rb +0 -36
  43. data/test/test_sanitizer_regex.rb +0 -87
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 2bc67eb2f60367d8d0ef00ca718c7d8b81b4a9c8
4
- data.tar.gz: f61389d6a073db31e79766c2711bbabb89b27699
2
+ SHA256:
3
+ metadata.gz: e5d0dc182771764b690661c69fdc905c4197b0a47e22f8a06903087719a3bb11
4
+ data.tar.gz: 7e995f86a4a1a9bb307914caff0ebb2ce39b0d2fafb62e8b1d4790608d7d6c5c
5
5
  SHA512:
6
- metadata.gz: 3a565e36afae190e18154bc366bbd3d1a77f06e0c51017ba34893448fd2588fa57e2c93647ef3de5338b423e63787ef248a56c512b67504a522122bb4b24e0ff
7
- data.tar.gz: 1cd6b487cb5fb2a2c5b659d2dacf0481ff5368f2e85f977c145bcf46e94f16d0543bbb59dd61995fb1b137c3bb4654308119c89b6ffa7883b0023684101b17dc
6
+ metadata.gz: b640ed837c55588df3f27381d86aeec14677d54d47236cf486ced33952a2619341fb396d79e280a2aa93c01600151a6ea3b0baa6a6d66c7c5ad6115479a72bbd
7
+ data.tar.gz: 147439c2077725912bf0ede44bb30376112abc1ec7810f9f17d46643cbc055e46ff0e2051e07098a284ff4c1990edd0051246cbd04eebd05b0603a72f1f1d128
data/Manifest.txt CHANGED
@@ -2,39 +2,17 @@ CHANGELOG.md
2
2
  Manifest.txt
3
3
  README.md
4
4
  Rakefile
5
- data/categories.csv
6
- data/codes.csv
7
- data/codesxref.csv
8
- data/comparisons.csv
9
5
  lib/factbook-readers.rb
10
- lib/factbook-readers/builder.rb
11
- lib/factbook-readers/builder_item.rb
12
- lib/factbook-readers/builder_json.rb
13
- lib/factbook-readers/codes.rb
14
- lib/factbook-readers/comparisons.rb
6
+ lib/factbook-readers/convert.rb
15
7
  lib/factbook-readers/counter.rb
16
- lib/factbook-readers/normalize.rb
17
8
  lib/factbook-readers/page.rb
18
9
  lib/factbook-readers/page_info.rb
19
- lib/factbook-readers/reader_json.rb
20
- lib/factbook-readers/sanitizer.rb
21
- lib/factbook-readers/sect.rb
22
- lib/factbook-readers/subsect.rb
23
- lib/factbook-readers/table.rb
24
- lib/factbook-readers/utils.rb
25
- lib/factbook-readers/utils_info.rb
26
10
  lib/factbook-readers/version.rb
27
11
  lib/factbook/readers.rb
28
12
  test/helper.rb
29
- test/test_builder.rb
30
- test/test_codes.rb
31
- test/test_comparisons.rb
32
13
  test/test_counter.rb
14
+ test/test_download.rb
33
15
  test/test_fields.rb
34
- test/test_item_builder.rb
35
16
  test/test_json.rb
36
- test/test_json_builder.rb
37
- test/test_normalize.rb
38
17
  test/test_page.rb
39
- test/test_sanitizer.rb
40
- test/test_sanitizer_regex.rb
18
+ test/test_version.rb
data/README.md CHANGED
@@ -7,6 +7,11 @@
7
7
  * forum :: [groups.google.com/group/openmundi](https://groups.google.com/group/openmundi)
8
8
 
9
9
 
10
+ ## What's the World Factbook?
11
+
12
+ See [factbook/factbook.json »](https://github.com/factbook/factbook.json)
13
+
14
+
10
15
 
11
16
  ## Usage
12
17
 
@@ -14,7 +19,7 @@
14
19
 
15
20
  ```ruby
16
21
  page = Factbook::Page.new( 'br' ) # br is the country code for Brazil
17
- pp page.data # pretty print hash
22
+ pp page.to_h # pretty print data hash
18
23
  ```
19
24
 
20
25
  resulting in:
@@ -94,78 +99,15 @@ end
94
99
  ```
95
100
 
96
101
 
97
- ### List all codes
98
-
99
- ```ruby
100
- Factbook.codes.each do |code|
101
- pp code
102
- end
103
- ```
104
-
105
- resulting in:
106
-
107
- ```
108
- #<struct Factbook::Codes::Code
109
- code ="af",
110
- name ="Afghanistan",
111
- category="Countries",
112
- region ="South Asia">
113
- #<struct Factbook::Codes::Code
114
- code ="al",
115
- name ="Albania",
116
- category="Countries",
117
- region ="Europe">
118
- #<struct Factbook::Codes::Code
119
- code ="ag",
120
- name ="Algeria",
121
- category="Countries",
122
- region ="Africa">
123
- #<struct Factbook::Codes::Code
124
- code ="an",
125
- name ="Andorra",
126
- category="Countries",
127
- region ="Europe">
128
- ...
129
- ```
130
-
131
- Note: You can filter codes by category e.g. Countries, Dependencies, Miscellaneous, Oceans, etc.
132
- and/or by region e.g. Africa, Europe, South Asia, Central America and Caribbean, etc.
133
102
 
134
103
 
135
- ```ruby
136
-
137
- assert_equal 261, Factbook.codes.size
138
-
139
- ## categories
140
- assert_equal 195, Factbook.codes.countries.size
141
- assert_equal 52, Factbook.codes.dependencies.size
142
- assert_equal 5, Factbook.codes.oceans.size
143
- assert_equal 1, Factbook.codes.world.size
144
- assert_equal 2, Factbook.codes.others.size
145
- assert_equal 6, Factbook.codes.misc.size
146
-
147
- ## regions
148
- assert_equal 55, Factbook.codes.europe.size
149
- assert_equal 9, Factbook.codes.south_asia.size
150
- assert_equal 6, Factbook.codes.central_asia.size
151
- assert_equal 22, Factbook.codes.east_n_souteast_asia.size
152
- assert_equal 19, Factbook.codes.middle_east.size
153
- assert_equal 56, Factbook.codes.africa.size
154
- assert_equal 7, Factbook.codes.north_america.size
155
- assert_equal 33, Factbook.codes.central_america_n_caribbean.size
156
- assert_equal 14, Factbook.codes.south_america.size
157
- assert_equal 30, Factbook.codes.australia_oceania.size
158
- assert_equal 4, Factbook.codes.antartica.size
159
- assert_equal 5, Factbook.codes.region('Oceans').size
160
- assert_equal 1, Factbook.codes.region('World').size
161
-
162
- ## categories + regions
163
- assert_equal 45, Factbook.codes.countries.europe.size
164
- ...
165
- ```
166
104
 
167
- See [`data/codes.csv`](data/codes.csv) for the built-in listing of all codes with categories and regions.
105
+ ## Ready-To-Use Public Domain (Free) Factbook Datasets
168
106
 
107
+ [factbook/factbook.json](https://github.com/factbook/factbook.json) - open (public domain)
108
+ factbook country profiles in JSON for all the world's countries (note: using the original
109
+ / official two-letter GEC (formerly FIPS) codes and NOT the ISO codes you might be used to for country codes - e.g. Austria is `au.json` and NOT `at.json`,
110
+ Germany is `gm.json` and NOT `de.json`, and so on)
169
111
 
170
112
 
171
113
 
data/Rakefile CHANGED
@@ -18,10 +18,10 @@ Hoe.spec 'factbook-readers' do
18
18
  self.history_file = 'CHANGELOG.md'
19
19
 
20
20
  self.extra_deps = [
21
- ['logutils' ],
22
- ['csvreader'],
21
+ ['factbook-codes' ],
22
+ ['factbook-fields' ],
23
23
  ['webget'],
24
- ['nokogiri'],
24
+ ## ['nokogiri'],
25
25
  ## ['activerecord'] # note: will include activesupport,etc.
26
26
  ]
27
27
 
@@ -1,13 +1,8 @@
1
- ## 3rd party gems/libs
2
- ## require 'props'
1
+ require 'factbook-fields'
3
2
 
4
- require 'logutils'
3
+ ## more 3rd party gems/libs
4
+ ## require 'props'
5
5
  require 'webget'
6
- require 'csvreader'
7
-
8
-
9
- require 'nokogiri'
10
-
11
6
 
12
7
 
13
8
 
@@ -15,40 +10,10 @@ require 'nokogiri'
15
10
  require 'factbook-readers/version' # let it always go first
16
11
 
17
12
 
18
- require 'factbook-readers/codes'
19
- require 'factbook-readers/comparisons'
20
-
21
-
22
- ## note: make codes, comparisons available
23
- module Factbook
24
- ## note: load on demand only builtin codes, comparisons, etc.
25
- ## for now
26
- def self.codes
27
- @@codes ||= Codes.read_csv( "#{Factbook::Module::Readers.root}/data/codes.csv" );
28
- end
29
- def self.comparisons
30
- @@comparisons ||= Comparisons.read_csv( "#{Factbook::Module::Readers.root}/data/comparisons.csv" )
31
- end
32
- end # module Factbook
33
13
 
34
-
35
-
36
- require 'factbook-readers/utils'
37
- require 'factbook-readers/utils_info'
38
- require 'factbook-readers/sanitizer'
39
- require 'factbook-readers/normalize'
40
- require 'factbook-readers/builder_item'
41
- require 'factbook-readers/builder'
42
- require 'factbook-readers/builder_json'
43
- require 'factbook-readers/page'
14
+ require 'factbook-readers/convert'
44
15
  require 'factbook-readers/page_info'
45
- require 'factbook-readers/sect'
46
- require 'factbook-readers/subsect'
47
-
48
-
49
- require 'factbook-readers/reader_json'
50
-
51
- require 'factbook-readers/table' ## e.g. TableReader
16
+ require 'factbook-readers/page'
52
17
 
53
18
  require 'factbook-readers/counter'
54
19
 
@@ -0,0 +1,37 @@
1
+
2
+
3
+ def convert_cia( cia )
4
+ ## convert from "raw" on-the-wire cia format to
5
+ ## "standard" compact "classic" format
6
+
7
+ data = {}
8
+
9
+ cia['categories'].each do |cia_cat|
10
+ cat = data[ cia_cat['title'] ] = {}
11
+ cia_cat['fields'].each do |cia_field|
12
+ field = cat[ cia_field['name'] ] = {}
13
+ if cia_field['subfields']
14
+ cia_field['subfields'].each do |cia_subfield|
15
+ subfield = field[ cia_subfield['name'] ] = {}
16
+ subfield[ 'text' ] = cia_subfield['content']
17
+ end
18
+
19
+ puts "== #{cia_cat['title']} / #{cia_field['name']} - skipping field content (w/ subfields):"
20
+ puts " >#{cia_field['content']}<"
21
+ puts " ?? same as:"
22
+ cia_field['subfields'].each do |cia_subfield|
23
+ puts " #{cia_subfield['name']}: >#{cia_subfield['content']}<"
24
+ end
25
+
26
+ else
27
+ field[ 'text' ] = cia_field['content']
28
+ end
29
+
30
+ if cia_field[ 'field_note' ]
31
+ field[ 'note' ] = cia_field[ 'field_note' ]
32
+ end
33
+ end
34
+ end
35
+
36
+ data
37
+ end
@@ -9,20 +9,20 @@ def initialize
9
9
  @data = {}
10
10
  end
11
11
 
12
- def count( page )
12
+ def count( code, page )
13
13
 
14
14
  ## walk page data hash
15
15
  # add nodes to data
16
16
 
17
- walk( page, page.data, @data )
17
+ walk( code, page.to_h, @data )
18
18
  end
19
19
 
20
20
 
21
21
  private
22
- def walk( page, hin, hout )
22
+ def walk( code, hin, hout )
23
23
  hin.each do |k,v|
24
- if v.is_a? Hash
25
- hout2 = hout[k] || { count: 0, codes: '' }
24
+ if v.is_a?( Hash )
25
+ hout2 = hout[k] ||= { count: 0, codes: '' }
26
26
 
27
27
  hout2[ :count ] += 1
28
28
 
@@ -32,12 +32,10 @@ def walk( page, hin, hout )
32
32
  codes = hout2[ :codes ]
33
33
  if codes ## note: might have been deleted if past the threshold (e.g. 9 entries)
34
34
  codes << ' ' unless codes.empty? ## add separator (space for now)
35
- codes << page.info.country_code
35
+ codes << code
36
36
  hout2[ :codes ] = codes
37
37
  end
38
-
39
- hout[k] = hout2
40
- walk( page, v, hout2 )
38
+ walk( code, v, hout2 )
41
39
  end
42
40
  end
43
41
  end
@@ -5,23 +5,12 @@ module Factbook
5
5
  class Page
6
6
  include LogUtils::Logging
7
7
 
8
- attr_reader :sects ## "structured" access e.g. sects/subsects/etc.
9
- attr_reader :info ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
10
- attr_reader :data ## "plain" access with vanilla hash
11
-
8
+ attr_reader :info ## meta info e.g. country_code, country_name, region_name, updated, etc.
12
9
 
13
10
  ## standard version (note: requires https)
14
- SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
15
-
11
+ SITE_BASE = "https://www.cia.gov/the-world-factbook/geos/{code}.json"
16
12
 
17
- def self.parse( html ) ## parse html from string
18
- new( html: html )
19
- end
20
13
 
21
- def self.read( path )
22
- html = File.open( path, 'r:utf-8' ) { |f| f.read }
23
- new( html: html )
24
- end
25
14
 
26
15
  def self.parse_json( json ) ## parse json from string
27
16
  new( json: json )
@@ -36,87 +25,78 @@ class Page
36
25
  new( code, cache: cache )
37
26
  end
38
27
 
39
- ## some convenience alias(es)
40
- class << self
41
- alias_method :read_html, :read
42
- alias_method :parse_html, :parse
43
- end
44
28
 
45
29
 
46
30
  def initialize( code=nil,
47
31
  json: nil,
48
- html: nil,
49
32
  cache: false,
50
33
  info: nil )
51
34
  if json
52
35
  ## note: assumes json is (still) a string/text
53
36
  ## (NOT yet parsed to structured data)
54
- b = JsonBuilder.new( json )
55
- else ## assume html
56
- if html
57
- ## for debugging and testing allow "custom" passed-in html page
58
- else
37
+ b = ProfileBuilder.new( json )
38
+ else ## assume "raw" json dataset
59
39
  ## allow passing in code struct too - just use/pluck two-letter code from struct !!!
60
40
  code = code.code if code.is_a?( Codes::Code )
61
41
 
62
42
  raise ArgumentError, "two letter code (e.g. au) required to download page & build page url" if code.nil?
63
43
  url = SITE_BASE.sub( '{code}', code )
64
44
 
65
- html = if cache && Webcache.exist?( url )
66
- Webcache.read( url ) ## for debugging - read from cache
67
- else
68
- download_page( url )
69
- end
70
- end
71
- b = Builder.new( html )
45
+ raw_data = if cache && Webcache.exist?( url )
46
+ text = Webcache.read( url ) ## for debugging - read from cache
47
+ JSON.parse( text )
48
+ else
49
+ download_data( url )
50
+ end
51
+
52
+ ## meta info from raw data - example:
53
+ ## "name": "Aruba",
54
+ ## "code": "AA",
55
+ ## "region": "Central America",
56
+ ## "published": "2021-01-25 09:07:08 -0500",
57
+ ## "updated": "2021-01-22 14:38:14 -0500",
58
+ ##
59
+ ## note: published is NOT before updated (like an alias for created) BUT is often older/later than updated - why!?
60
+
61
+ @info = PageInfo.new
62
+
63
+ @info.country_code = raw_data['code'].downcase
64
+ @info.country_name = raw_data['name']
65
+ @info.region_name = raw_data['region']
66
+
67
+ ## note: just parse year,month,day for now (skip hours,minutes,etc.)
68
+ @info.published = Date.strptime( raw_data['published'], '%Y-%m-%d' )
69
+ @info.updated = Date.strptime( raw_data['updated'], '%Y-%m-%d' )
70
+
71
+ data = convert_cia( raw_data )
72
+ b = ProfileBuilder.new( data )
72
73
  end
73
74
 
74
- @sects = b.sects
75
- @info = b.info
75
+ @profile = b.profile
76
76
 
77
77
  ## todo/fix/quick hack:
78
78
  ## check for info opts - lets you overwrite page info
79
79
  ## -- use proper header to setup page info - why, why not??
80
80
  @info = info if info
81
-
82
-
83
- @data = {}
84
- @sects.each do |sect|
85
- @data[ sect.title ] = sect.data
86
- end
87
81
  end
88
82
 
89
83
 
90
- def to_json( minify: false ) ## convenience helper for data.to_json; note: pretty print by default!
91
- if minify
92
- data.to_json
93
- else ## note: pretty print by default!
94
- JSON.pretty_generate( data )
95
- end
96
- end
97
84
 
98
-
99
- def [](key) ### convenience shortcut
100
- # lets you use
101
- # page['geo']
102
- # instead of
103
- # page.data['geo']
104
-
105
- ## fix: use delegate data, [] from forwardable lib - why?? why not??
106
-
107
- data[key]
108
- end
85
+ ## convenience helpers - forward to profile
86
+ def [](key) @profile[key]; end
87
+ def to_h() @profile.to_h; end
88
+ def to_json( minify: false ) @profile.to_json( minify: minify ); end
89
+ def size() @profile.size; end
109
90
 
110
91
 
111
92
  private
112
- def download_page( url )
113
- response = Webget.page( url )
93
+ def download_data( url )
94
+ response = Webget.call( url )
114
95
 
115
96
  ## note: exit on get / fetch error - do NOT continue for now - why? why not?
116
97
  exit 1 if response.status.nok? ## e.g. HTTP status code != 200
117
98
 
118
-
119
- response.text
99
+ response.json
120
100
  end
121
101
  end # class Page
122
102
  end # module Factbook