factbook 2.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -1
  3. data/Manifest.txt +0 -61
  4. data/README.md +8 -506
  5. data/Rakefile +4 -9
  6. data/lib/factbook.rb +4 -64
  7. metadata +6 -124
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -212
  16. data/lib/factbook/builder_item.rb +0 -126
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -148
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -178
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -129
  34. data/lib/factbook/version.rb +0 -21
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -19
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/ag.html +0 -716
  48. data/test/data/src/au-2015-09-24.html +0 -2006
  49. data/test/data/src/au.html +0 -658
  50. data/test/data/src/be-2015-09-24.html +0 -2011
  51. data/test/data/src/be.html +0 -648
  52. data/test/helper.rb +0 -11
  53. data/test/test_attribs.rb +0 -87
  54. data/test/test_attribs_def.rb +0 -20
  55. data/test/test_builder.rb +0 -35
  56. data/test/test_codes.rb +0 -76
  57. data/test/test_comparisons.rb +0 -19
  58. data/test/test_convert.rb +0 -30
  59. data/test/test_counter.rb +0 -31
  60. data/test/test_fields.rb +0 -52
  61. data/test/test_importer.rb +0 -56
  62. data/test/test_item_builder.rb +0 -99
  63. data/test/test_json.rb +0 -45
  64. data/test/test_json_builder.rb +0 -25
  65. data/test/test_normalize.rb +0 -23
  66. data/test/test_page.rb +0 -38
  67. data/test/test_sanitizer.rb +0 -39
  68. data/test/test_sanitizer_regex.rb +0 -89
@@ -1,79 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- ######
6
- # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
7
-
8
# Rebuilds a page's sections from previously "dumped" JSON instead of parsing
# the HTML page.
#
# The JSON document is walked as two nested levels of key/value pairs:
#   section title => { subsection title => subsection data }
# Each first-level pair becomes a Sect, each second-level pair a Subsect.
class JsonBuilder
  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category

  # Reads the JSON document stored at +path+ and builds from it.
  #
  # The file is always decoded as UTF-8 (a leading byte-order mark is skipped)
  # rather than with the platform's default external encoding.
  def self.from_file( path )
    text = File.open( path, 'r:bom|utf-8' ) { |file| file.read }
    from_string( text )
  end

  # Builds from a JSON document passed in as a string.
  def self.from_string( text )
    new( text )
  end

  attr_reader :text,     ## the raw JSON text as passed in
              :json,     ## the parsed JSON document
              :info,     ## not used yet -- page info incl. country_name, region_name, last_updated etc.
              :errors,   ## not used yet -- encoding errors etc.
              :sects     ## array of Sect objects (one per top-level JSON entry)

  # Parses +text+ as JSON and converts every top-level entry into a Sect.
  # Raises whatever JSON.parse raises for malformed input.
  def initialize( text )
    @text   = text
    @json   = JSON.parse( text )

    @info   = nil   ## fix/todo: no page info for now (use header in json - why? why not??)
    @errors = []    ## fix/todo: no errors possible/tracked for now

    @sects  = @json.map { |title, subsects| build_sect( title, subsects ) }
  end

  private

  # Builds one Sect from a section title and its subsection pairs.
  def build_sect( title, subsects )
    sect = Sect.new
    sect.title    = title
    sect.subsects = subsects.map { |sub_title, data| build_subsect( sub_title, data ) }
    sect
  end

  # Builds one Subsect; hash data gets its keys normalized, anything else is
  # stored untouched.
  def build_subsect( title, data )
    subsect = Subsect.new
    subsect.title = title
    subsect.data  = data.is_a?( Hash ) ? normalize_keys( data ) : data
    subsect
  end

  # Returns a copy of +data+ with every key run through normalize_category (again).
  def normalize_keys( data )
    data.each_with_object( {} ) do |(key, value), normalized|
      normalized[ normalize_category( key ) ] = value
    end
  end

end # class JsonBuilder
77
-
78
-
79
- end # module Factbook
@@ -1,119 +0,0 @@
1
- # encoding: utf-8
2
-
3
- ##
4
- # note:
5
- # the factbook category/region for world is other entities (on FAQ) and oceans in page
6
- # changed to world
7
-
8
-
9
- module Factbook
10
-
11
# A filterable list of factbook codes.
#
# Every filter (category, region and the convenience shortcuts built on them)
# returns a new Codes object, so filters can be chained, e.g.
#   codes.countries.europe
class Codes

  Code = Struct.new( :code,       ## todo: add notes (country affiliation) - why? why not??
                     :name,
                     :category,   ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
                     :region )    ## e.g. Europe, Oceans, etc.

  # Loads codes from the CSV file at +path+. The file needs a header row with
  # Code and Name columns; Category and Region are optional (for now).
  #
  # note:
  #   if you use quotes - NO leading spaces allowed e.g.
  #     use au,"Austria",...  and NOT  au, "Austria", ...
  #   for headers - NO leading spaces allowed e.g.
  #     use Code,Name,Category,Region,...  and NOT  Code, Name, Category, Region, ...
  #
  # Nothing is printed while loading.
  def self.from_csv( path )
    recs = CSV.read( path, headers: true ).map do |row|
      rec = Code.new
      rec.code     = row['Code'].strip   ## remove leading and trailing whitespace
      rec.name     = row['Name'].strip
      rec.category = row['Category'].strip  if row['Category']
      rec.region   = row['Region'].strip    if row['Region']
      rec
    end

    new( recs )
  end

  # +codes+ is an array of Code records.
  def initialize( codes )
    @codes = codes
  end

  def size() @codes.size; end

  # Yields every Code record; returns an enumerator when called without a block.
  def each( &block )
    return enum_for( :each ) unless block_given?
    @codes.each( &block )
  end

  # Returns the array of code strings (not the Code records).
  def to_a
    @codes.map { |code| code.code }
  end

  ## "pre-defined" convenience shortcuts
  def countries()       category 'Countries'; end
  def world()           category 'World'; end
  def oceans()          category 'Oceans'; end
  def misc()            category 'Miscellaneous'; end
  def others()          category 'Other'; end
  def dependencies()    category 'Dependencies'; end
  def dependencies_us() category 'Dependencies (United States)'; end
  ## fix/todo: add all dependencies uk (or gb?), fr,cn,au,nz,no,dk,etc.

  def europe()                       region 'Europe'; end
  def south_asia()                   region 'South Asia'; end
  def central_asia()                 region 'Central Asia'; end
  def east_n_southeast_asia()        region 'East & Southeast Asia'; end
  def middle_east()                  region 'Middle East'; end
  def africa()                       region 'Africa'; end
  def north_america()                region 'North America'; end
  def central_america_n_caribbean()  region 'Central America and Caribbean'; end
  def south_america()                region 'South America'; end
  def australia_oceania()            region 'Australia-Oceania'; end
  def antarctica()                   region 'Antarctica'; end

  ## keep the original (misspelled) method names working
  alias_method :east_n_souteast_asia, :east_n_southeast_asia
  alias_method :antartica,            :antarctica

  ## note: regions oceans and world - same as category oceans and world
  ##   use category('World')  n region('World')
  ##   use category('Oceans') n region('Oceans')

  # Returns the codes whose category contains +query+ (case-insensitive
  # substring match); records without a category never match.
  def category( query )
    filter_by( :category, query )
  end

  # Returns the codes whose region contains +query+ (case-insensitive
  # substring match); records without a region never match.
  def region( query )
    filter_by( :region, query )
  end

  private

  # Shared filter for category/region.
  #   todo/future: allow passing in of regex too (not just string)
  #   note: the query is escaped, e.g. Dependencies (France) gets matched
  #         as Dependencies \(France\)
  def filter_by( field, query )
    filter_regex = /#{Regexp.escape(query)}/i
    matches = @codes.select do |code|
      value = code[ field ]
      value ? filter_regex.match( value ) : false
    end
    Codes.new( matches )   ## return new Codes obj for easy-chaining
  end

end # class Codes
117
-
118
- end # module Factbook
119
-
@@ -1,50 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
# A list of factbook country comparisons.
class Comparisons

  Comparison = Struct.new( :num,        ## todo: use no or id or something - why? why not?
                           :category,   ## e.g. Geography, People, Economy, etc.
                           :name )

  # Loads comparisons from the CSV file at +path+. The file needs a header
  # row with Num, Category and Name columns; all three are required.
  # Nothing is printed while loading.
  def self.from_csv( path )
    recs = CSV.read( path, headers: true ).map do |row|
      ## remove leading and trailing whitespace from every field
      Comparison.new( row['Num'].strip.to_i,
                      row['Category'].strip,
                      row['Name'].strip )
    end

    new( recs )
  end

  # +comps+ is an array of Comparison records.
  def initialize( comps )
    @comps = comps
  end

  def size() @comps.size; end

  # Yields every Comparison record; returns an enumerator when called without a block.
  def each( &block )
    return enum_for( :each ) unless block_given?
    @comps.each( &block )
  end

  # Returns the array of comparison nums -- return something else - why? why not?
  def to_a
    @comps.map { |comp| comp.num }
  end

end # class Comparisons
48
-
49
- end # module Factbook
50
-
@@ -1,48 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
# Counts how often every (nested) hash node occurs across pages.
#
# The result in +data+ mirrors the nesting of the page data hashes. Every
# hash-valued key maps to a node hash holding
#   :count - number of counted pages that had this node
#   :codes - space-separated country codes of those pages; dropped once
#            the count passes the threshold
# plus the nested child nodes under their own keys.
class Counter

  ## max. number of pages for which the country codes get listed per node
  CODES_THRESHOLD = 9

  attr_reader :data

  # threshold: once a node's count is larger than this, its :codes entry is
  # deleted (and never re-added).
  def initialize( threshold: CODES_THRESHOLD )
    @data      = {}
    @threshold = threshold
  end

  # Adds all hash nodes of +page+ to data.
  # Uses page.data (nested hash) and page.info.country_code.
  def count( page )
    walk( page, page.data, @data )
  end


private
  # Walks +hin+ and merges every hash-valued entry into +hout+;
  # non-hash values are skipped.
  def walk( page, hin, hout )
    hin.each do |key, value|
      next unless value.is_a?( Hash )

      ## note: start with a fresh mutable string per node (never mutate a literal)
      node = (hout[key] ||= { count: 0, codes: ''.dup })
      node[ :count ] += 1

      ## delete codes if larger than threshold
      node.delete( :codes ) if node[ :count ] > @threshold

      codes = node[ :codes ]
      if codes    ## note: might have been deleted if the threshold got passed
        codes << ' ' unless codes.empty?    ## add separator (space for now)
        codes << page.info.country_code
      end

      walk( page, value, node )
    end
  end

end # class Counter
47
-
48
- end # module Factbook
@@ -1,92 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
# Imports selected page facts (area, population, rates) into the database
# as one Fact record per country code.
class Importer

  ## note: a leading minus sign is part of the number (e.g. -5.56 migrant(s)/1,000);
  ##  a hyphen directly following a digit (e.g. in a range like 5-10%) is not.
  RATE_REGEX    = %r{((?<![0-9.])-?[0-9.]+) [a-z()]+/1,000}
  PERCENT_REGEX = /((?<![0-9.])-?[0-9.]+)%/
  NUM_REGEX     = /([0-9,.]+)/
  SQ_KM_REGEX   = /([0-9,.]+) sq km/

  # Creates the Fact record for the page's country code or updates the
  # existing one. Values that cannot be parsed are stored as nil.
  #
  # note: assumes an active database connection
  def import( page )
    code = page.info.country_code
    name = page.info.country_name

    attribs = {
      name:              name,
      area:              sq_km( page.area ),         # e.g. 83,871 sq km
      area_land:         sq_km( page.area_land ),    # e.g. 82,445 sq km
      area_water:        sq_km( page.area_water ),   # e.g. 1,426 sq km

      population:        num( page.population ),                    # e.g. 8,665,550 (July 2015 est.)
      population_growth: percent( page.population_growth ),         # e.g. 0.55% (2015 est.)
      birth_rate:        rate_per_thousand( page.birth_rate ),      # e.g. 9.41 births/1,000 population (2015 est.)
      death_rate:        rate_per_thousand( page.death_rate ),      # e.g. 9.42 deaths/1,000 population (2015 est.)
      migration_rate:    rate_per_thousand( page.migration_rate ),  # e.g. 5.56 migrant(s)/1,000 population (2015 est.)
    }

    rec = Fact.find_by( code: code )
    if rec.nil?    ## create (new) record
      rec = Fact.new
      attribs[ :code ] = code
      puts "create fact record #{code}/#{name}:"
    else           ## update (existing) record
      puts "update fact record #{code}/#{name}:"
    end

    puts " #{attribs.inspect}"
    ## note: use update! - update_attributes! is no longer available in newer ActiveRecord versions
    rec.update!( attribs )
  end


  # Parses a rate per thousand into a float (sign included); returns nil
  # (and prints a warning) if the text does not match.
  #   e.g.  9.41 births/1,000 population (2015 est.)
  #         9.42 deaths/1,000 population (2015 est.)
  #        -5.56 migrant(s)/1,000 population (2015 est.)
  def rate_per_thousand( text )
    value = text.to_s[ RATE_REGEX, 1 ]
    return value.to_f if value

    puts "*** warn: unknown rate <name>/1,000 format (no match): >#{text}<"
    nil
  end

  # Parses the first number into an integer; returns nil (and prints a
  # warning) if there is no number.
  #   e.g. 8,665,550 (July 2015 est.)
  def num( text )
    value = text.to_s[ NUM_REGEX, 1 ]
    return value.gsub( ',', '' ).to_i if value   ## note: remove commas (,) if present

    puts "*** warn: unknown number format (no match): >#{text}<"
    nil
  end

  # Parses a percentage into a float (sign included); returns nil (and
  # prints a warning) if the text does not match.
  #   e.g. 0.55% (2015 est.)  or  -0.55% (2015 est.)
  def percent( text )
    value = text.to_s[ PERCENT_REGEX, 1 ]
    return value.to_f if value

    puts "*** warn: unknown percent format (no match): >#{text}<"
    nil
  end

  # Parses an area in sq km into an integer (fractions are cut off);
  # returns nil (and prints a warning) if the text does not match.
  #   e.g. 83,871 sq km
  ## todo - check vatican - uses float e.g. 0.44 ?? add support?
  def sq_km( text )
    value = text.to_s[ SQ_KM_REGEX, 1 ]
    return value.gsub( ',', '' ).to_i if value   ## note: remove commas (,) if present

    puts "*** warn: unknown sq km format (no match): >#{text}<"
    nil
  end

end # class Importer
90
-
91
- end # module Factbook
92
-
@@ -1,11 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
# ActiveRecord model for one fact record; adds no behavior of its own on
# top of ActiveRecord::Base.
class Fact < ActiveRecord::Base
end # class Fact
9
-
10
-
11
- end # module Factbook
@@ -1,36 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
# Defines the database schema: a single facts table.
class CreateDb

  # Creates the facts table via ActiveRecord::Schema.define.
  #
  # Columns (with sample source values):
  #   code               country code e.g. au        (required)
  #   name               country name e.g. Austria   (required)
  #   area               e.g. 83,871 sq km
  #   area_land          e.g. 82,445 sq km   -- use float - why? why not?
  #   area_water         e.g. 1,426 sq km
  #   population         e.g. 8,665,550 (July 2015 est.)
  #   population_growth  e.g. 0.55% (2015 est.)
  #   birth_rate         e.g. 9.41 births/1,000 population (2015 est.)
  #   death_rate         e.g. 9.42 deaths/1,000 population (2015 est.)
  #   migration_rate     e.g. 5.56 migrant(s)/1,000 population (2015 est.)
  def up
    ActiveRecord::Schema.define do
      create_table :facts do |t|
        t.string  :code, :name, null: false
        t.integer :area, :area_land, :area_water, :population
        t.float   :population_growth, :birth_rate, :death_rate, :migration_rate

        t.timestamps
      end
    end
  end

end # class CreateDb
35
-
36
- end # module Factbook
@@ -1,43 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
# Mixin for cleaning up category names.
module NormalizeHelper

  ## known special cases - maps the category as found to the name to use
  CATEGORY_FIXES = {
    ## typos
    'ntoe'                           => 'note',
    'investment if fixed capital'    => 'investment in fixed capital',
    ## downcase
    'Lowest point'                   => 'lowest point',
    'Chief of state'                 => 'chief of state',
    ## spelling variant (use more popular one)
    'signed but not ratified'        => 'signed, but not ratified',
    'vectorborne disease'            => 'vectorborne diseases',
    'water contact diseases'         => 'water contact disease',
    'food or waterborne disease'     => 'food or waterborne diseases',
    'geographical coordinates'       => 'geographic coordinates',
    'notes'                          => 'note',
    'refugees (countries of origin)' => 'refugees (country of origin)',
  }.freeze

  # Returns the cleaned-up category name for +text+:
  #   1) surrounding whitespace and any trailing colons get removed
  #      (note: fixes double colons too e.g. note:: instead of note:)
  #   2) known typos, case and spelling variants get replaced
  #   3) anything starting with border countries becomes border countries
  #      e.g. border countries (8): -- removes the (x) counter
  def normalize_category( text )
    cleaned = text.strip.sub( /:+\z/, '' ).strip

    fixed   = CATEGORY_FIXES[ cleaned ]
    cleaned = fixed.dup  if fixed    ## note: always hand out a fresh string

    return 'border countries'  if cleaned.start_with?( 'border countries' )

    cleaned
  end

end # module NormalizeHelper
43
- end # module Factbook