factbook 1.1.1 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +4 -4
  2. data/{HISTORY.md → CHANGELOG.md} +3 -3
  3. data/Manifest.txt +1 -58
  4. data/README.md +50 -575
  5. data/Rakefile +29 -33
  6. data/lib/factbook.rb +8 -75
  7. metadata +20 -114
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -214
  16. data/lib/factbook/builder_item.rb +0 -92
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -185
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -207
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -102
  34. data/lib/factbook/version.rb +0 -22
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -18
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/au.html +0 -2006
  48. data/test/data/src/be.html +0 -2011
  49. data/test/helper.rb +0 -11
  50. data/test/test_attribs.rb +0 -82
  51. data/test/test_attribs_def.rb +0 -20
  52. data/test/test_builder.rb +0 -35
  53. data/test/test_codes.rb +0 -76
  54. data/test/test_comparisons.rb +0 -19
  55. data/test/test_convert.rb +0 -30
  56. data/test/test_counter.rb +0 -31
  57. data/test/test_fields.rb +0 -52
  58. data/test/test_importer.rb +0 -55
  59. data/test/test_item_builder.rb +0 -99
  60. data/test/test_json.rb +0 -44
  61. data/test/test_json_builder.rb +0 -25
  62. data/test/test_normalize.rb +0 -23
  63. data/test/test_page.rb +0 -38
  64. data/test/test_sanitizer.rb +0 -35
@@ -1,29 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Sect
7
- include LogUtils::Logging
8
-
9
- attr_accessor :title ## use name instead of title - why? why not?
10
- attr_accessor :subsects
11
-
12
- def initialize
13
- @subsects = []
14
- end
15
-
16
- def data
17
- ## convert sects to hash
18
- @data = {}
19
-
20
- subsects.each_with_index do |subsect,i|
21
- @data[ subsect.title ] = subsect.data
22
- end
23
- @data
24
- end
25
-
26
-
27
- end # class Sect
28
-
29
- end # module Factbook
@@ -1,18 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Subsect
7
- include LogUtils::Logging
8
-
9
- attr_accessor :title ## use name instead of title - why? why not?
10
- attr_accessor :data ## hash holding data e.g. { 'text' => '...' etc. }
11
-
12
- def initialize
13
- @data = {}
14
- end
15
-
16
- end # class Subsect
17
-
18
- end # module Factbook
@@ -1,52 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- ##
6
- ## make more "generic" - why? why not?
7
- ## (re)use for other files ?? move to textutils ??
8
-
9
- ##
10
- ## for now reads in rows with values separated by at least 3+ spaces e.g.:
11
- ## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
12
- ## 1 China 1,367,485,388
13
- ## 2 India 1,251,695,584
14
- ## 3 European Union 513,949,445
15
- ## 4 United States 321,368,864
16
- ## 5 Indonesia 255,993,674
17
- ## 6 Brazil 204,259,812
18
-
19
-
20
- class TableReader
21
- include LogUtils::Logging
22
-
23
-
24
- def initialize( text )
25
- @text = text
26
- end
27
-
28
- def read
29
- recs = []
30
-
31
- line_no = 0
32
- @text.each_line do |line|
33
- line_no +=1
34
- line = line.strip ## remove leading and trailing whitespace
35
- if line.empty?
36
- puts "** skipping empty line #{line_no}"
37
- next
38
- end
39
-
40
- values = line.split( /[ ]{3,}/ ) ## split three or more spaces - use just two ?? why? why not??
41
-
42
- ## puts line
43
- ## pp values
44
- recs << values
45
- end
46
- recs
47
- end
48
-
49
-
50
- end # class TableReader
51
-
52
- end # module Factbook
@@ -1,85 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
- module Utils
5
-
6
- ########################################
7
- ## todo: move to textutils - why, why not ?????
8
-
9
- def encode_utf8( text )
10
-
11
- errors = [] ## also return list of encoding errors
12
-
13
- ## note: factbook claims utf-8 - but includes invalid bytes in some pages
14
- ## encoding is likley wester/windows-
15
-
16
- ## note:
17
- ## use � - unknown/invalid unicode char
18
- ## fix/todo: use ASCII-8BIT instead of binnary
19
- text = text.encode('UTF-8', 'binary', :invalid => :replace,
20
- :undef => :replace,
21
- :replace => '�' )
22
-
23
- ## check for replaced/invalid chars and log warrning
24
- pos = text.index( '�' )
25
- while pos
26
- from = pos-10 ## tood/fix: use min/max to check for bounds - why? why not??
27
- to = pos+10
28
- around = text[from..to]
29
- puts " pos #{pos}, from #{from}, to #{to}, around >#{around}<"
30
- msg = "invalid char on pos #{pos} around: >#{around}<"
31
- puts msg
32
- ## also log message / w timestamp
33
-
34
- errors << "#{Time.now} - #{msg}"
35
-
36
- pos = text.index( '�', pos+1 )
37
- end
38
-
39
- [text,errors] ## return text and errors (list)
40
- end
41
-
42
-
43
-
44
- def values_to_csv( values )
45
- buf = ""
46
- values.each_with_index do |value,i|
47
- buf << ',' if i > 0 ## add comma (except for first value)
48
- ## note: allow optional $ sign e.g. $100,000,000
49
- ## !!!! todo/fix: allow optional minus e.g. -44,000
50
- if value =~ /^\$?[1-9][,0-9]+[0-9]$/ ### find a better regex - why? why not??
51
- ## check if number e.g. 17,098,242 or $17,098,242
52
- ## remove commas 17098242
53
- buf << value.gsub( ',', '' )
54
- elsif value.index( ',').nil?
55
- ## add as is 1:1 (no commana)
56
- buf << value
57
- else
58
- ## escape comma with double quote
59
- # e.g. Guam, The becomes "Guam, The"
60
- buf << '"'
61
- buf << value
62
- buf << '"'
63
- end
64
- end
65
- buf
66
- end
67
-
68
-
69
- def data_to_csv( recs, headers )
70
- text = ""
71
-
72
- text << values_to_csv( headers )
73
- text << "\n"
74
-
75
- recs.each do |rec|
76
- text << values_to_csv( rec )
77
- text << "\n"
78
- end
79
-
80
- text
81
- end
82
-
83
-
84
- end # module Utils
85
- end # module Factbook
@@ -1,102 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
- module Utils
5
-
6
- #######
7
- ## find meta data (about page info)
8
-
9
-
10
- #### e.g. Page last updated on September 16, 2015
11
-
12
- MONTH_EN_TO_S={
13
- 'January' => '1',
14
- 'February' => '2',
15
- 'March' => '3',
16
- 'April' => '4',
17
- 'May' => '5',
18
- 'June' => '6',
19
- 'July' => '7',
20
- 'August' => '8',
21
- 'September' => '9',
22
- 'October' => '10',
23
- 'November' => '11',
24
- 'December' => '12'
25
- }
26
-
27
- PAGE_LAST_UPDATED_REGEX = /
28
- Page \s last \s updated \s on \s
29
- (?<month_en>[a-z]+) \s
30
- (?<day>\d{1,2}), \s
31
- (?<year>\d{4})
32
- /imx
33
-
34
- def find_page_last_updated( html )
35
- m = PAGE_LAST_UPDATED_REGEX.match( html )
36
- if m
37
- pp m
38
- month_en = m[:month_en]
39
- day = m[:day]
40
- year = m[:year]
41
- puts "** bingo - month #{month_en}, day #{day}, year #{year}"
42
-
43
- month = MONTH_EN_TO_S[ month_en ]
44
- date_str = "#{year}-#{month}-#{day}"
45
- pp date_str
46
- date = Date.strptime( date_str, '%Y-%m-%d' )
47
- date
48
- else
49
- nil
50
- end
51
- end
52
-
53
- ##
54
- ## e.g. regioncode="eur"
55
- ## countrycode="au"
56
- ## countryname="Austria"
57
- ## flagsubfield=""
58
- ## countryaffiliation=""
59
- ## flagdescription=""
60
- ## flagdescriptionnote=""
61
- ## region="Europe"
62
- ##
63
- ## note: countryaffiliation may be empty
64
-
65
-
66
-
67
- PAGE_INFO_REGEX = /
68
- regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
69
- \s+
70
- countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2> ## is k<3> backref
71
- \s+
72
- countryname=(?<q3>"|')(?<country>.+?)\k<q3>
73
- \s+
74
- [^>]+? ## allow any attribs (note: non-greedy)
75
- countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4> ## note: might be empty
76
- \s+
77
- [^>]+? ## allow any attribs (note: non-greedy)
78
- region=(?<q5>"|')(?<region>.+?)\k<q5> ## check world - might be empty ?? or for ocean ??
79
- /imx
80
-
81
-
82
- def find_page_info( html )
83
- m = PAGE_INFO_REGEX.match( html )
84
- if m
85
- pp m
86
-
87
- h = { country_code: m[:country_code],
88
- country_name: m[:country],
89
- country_affiliation: m[:affiliation],
90
- region_code: m[:region_code],
91
- region_name: m[:region] }
92
-
93
- puts "** bingo - #{h.inspect}"
94
- h ## return hash w/ name-value pairs
95
- else
96
- nil ## or return empty struct with nils/empty strings - why?? why not??
97
- end
98
- end
99
-
100
-
101
- end # module Utils
102
- end # module Factbook
@@ -1,22 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- MAJOR = 1
6
- MINOR = 1
7
- PATCH = 1
8
- VERSION = [MAJOR,MINOR,PATCH].join('.')
9
-
10
- def self.version
11
- VERSION
12
- end
13
-
14
- def self.banner
15
- "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
16
- end
17
-
18
- def self.root
19
- "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
20
- end
21
-
22
- end
@@ -1,48 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/almanac.rb
5
-
6
-
7
- require 'factbook'
8
-
9
-
10
- TEMPLATE = <<EOS
11
-
12
- ### <%= names %>
13
-
14
- <%= page.name_long=='none' ? '\-' : page.name_long %> › <%= page.map %> -- <%= page.location %> <br>
15
- <%= page.capital %> • <%= page.area %> • pop. <%= page.population %>
16
-
17
- **Languages:** <%= page.languages %>
18
- **Major cities:** <%= page.major_cities %>
19
- **Ethnic groups:** <%= page.ethnic_groups %>
20
- **Religions:** <%= page.religions %>
21
- **Independence:** <%= page.independence %>
22
-
23
- **Internet:** `<%= page.internet %>` • <%= page.internet_users %> • <%= page.internet_users_rate %>
24
- **Telephones - mobile:** <%= page.telephones_mobile %> • <%= page.telephones_mobile_subscriptions %> subs./100
25
-
26
- EOS
27
-
28
-
29
- #########################
30
- ### read all countries
31
- ### using local json (dump) files
32
-
33
- ## see github.com/factbook/factbook.json (use git clone)
34
- json_dir = '../../factbook/factbook.json'
35
- codes = Factbook.codes.countries
36
- ## todo: add tawain and ?? from others - why, why not??
37
-
38
- pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )
39
-
40
- almanac = Factbook::Almanac.new( pages )
41
-
42
- ## save to disk
43
-
44
- File.open( './tmp/ALMANAC.md', 'w' ) do |f|
45
- f.write almanac.render( TEMPLATE )
46
- end
47
-
48
- puts "Done."
@@ -1,34 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/attributes.rb
5
-
6
-
7
- # e.g. prints attribute accessor shortcuts
8
- #
9
- # ### Geography
10
- #
11
- # - `location` => Location
12
- # - `coords` => Geographic coordinates
13
- # - `map` => Map references
14
- # ...
15
-
16
- require 'factbook'
17
-
18
-
19
- attribs= Factbook.attributes.to_a
20
-
21
- h = attribs.group_by { |a| a.category }
22
-
23
- pp h
24
-
25
- h.each do |k,v|
26
- puts ""
27
- puts "### #{k}"
28
- puts ""
29
-
30
- v.each do |a|
31
- puts "- `#{a.name}` => #{a.path.join(' › ')}"
32
- end
33
- end
34
-
@@ -1,28 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run/test:
4
- # ruby -I ./lib script/build.rb
5
-
6
- require 'factbook'
7
-
8
- DB_CONFIG = {
9
- adapter: 'sqlite3',
10
- database: './factbook.db'
11
- }
12
-
13
- ActiveRecord::Base.logger = Logger.new( STDOUT )
14
- ActiveRecord::Base.establish_connection( DB_CONFIG )
15
-
16
- Factbook::CreateDb.new.up ## create tables
17
-
18
- importer = Factbook::Importer.new
19
-
20
- Factbook.codes.each do |code|
21
- puts "Fetching #{code.code}- #{code.name}..."
22
- page = Factbook::Page.new( code.code )
23
-
24
- puts "Adding #{code.code}- #{code.name}..."
25
- importer.import( page )
26
- end
27
-
28
- puts "Done."
@@ -1,145 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/counter.rb
5
-
6
- require 'factbook'
7
-
8
-
9
- c = Factbook::Counter.new
10
-
11
- ## see github.com/factbook/factbook.json (use git clone)
12
- json_dir = '../../factbook/factbook.json'
13
- codes = Factbook.codes
14
-
15
- pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )
16
-
17
- pages.each do |page|
18
- c.count( page )
19
- end
20
-
21
- h = c.data
22
- pp h
23
-
24
- ### save to json
25
- puts "saving a copy to categories.json for debugging"
26
- File.open( "tmp/categories.json", 'w' ) do |f|
27
- f.write JSON.pretty_generate( h )
28
- end
29
-
30
-
31
-
32
- SKIP_CATEGORIES_LINES=<<EOS
33
-
34
- ######
35
- ### france plus 5 overseas regions/departments
36
-
37
- ## metropolitan France
38
- ## metropolitan France - total
39
- overseas departments
40
- French Guiana
41
- French Guiana - total
42
- Guadeloupe
43
- Guadeloupe and Martinique
44
- Martinique
45
- Mayotte
46
- Reunion
47
-
48
-
49
- ###############
50
- ### more
51
-
52
- Iles Eparses
53
- Ile Amsterdam
54
- Ile Amsterdam (Ile Amsterdam et Ile Saint-Paul)
55
- Ile Amsterdam et Ile Saint-Paul
56
- Ile Saint Paul
57
- Ile Saint-Paul (Ile Amsterdam et Ile Saint-Paul)
58
- Iles Crozet
59
- Iles Kerguelen
60
- Adelie Land
61
- Bassas da India
62
- Bassas da India (Iles Eparses)
63
- Bassas da India, Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
64
- Europa Island
65
- Europa Island (Iles Eparses)
66
- Europa Island, Glorioso Islands, Juan de Nova Island
67
- Europa Island and Juan de Nova Island (Iles Eparses)
68
- Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
69
- Glorioso Islands
70
- Glorioso Islands (Iles Eparses)
71
- Glorioso Island (Iles Eparses)
72
- Juan de Nova Island
73
- Juan de Nova Island (Iles Eparses)
74
- Tromelin Island
75
- Tromelin Island (Iles Eparses)
76
- Saint Helena
77
- Ascension Island
78
- Ascension
79
- Tristan da Cunha
80
- Tristan da Cunha island group
81
- Baker Island
82
- Baker, Howland, and Jarvis Islands
83
- Baker, Howland, and Jarvis Islands, and Johnston Atoll
84
- Baker, Howland, and Jarvis Islands, and Kingman Reef
85
- Howland Island
86
- Jarvis Island
87
- Johnston Atoll
88
- Johnston Atoll and Kingman Reef
89
- Kingman Reef
90
- Midway Islands
91
- Midway Islands, Johnston, and Palmyra Atolls
92
- Midway Islands and Palmyra Atoll
93
- Palmyra Atoll
94
- note on Palmyra Atoll
95
- EOS
96
-
97
- ## allow empty lines and skip comments
98
- SKIP_CATEGORIES = SKIP_CATEGORIES_LINES.split("\n").select { |item| !(item =~ /^\s*$/ || item =~ /^\s*#/) }
99
-
100
-
101
- def print_categories( data )
102
- data.each do |k,v|
103
-
104
- puts ""
105
- puts "## #{k} _(#{v[:count]})_"
106
- puts ""
107
-
108
- walk_categories( v, 1 )
109
- end
110
- end
111
-
112
- def walk_categories( data, level )
113
- data.each do |k,v|
114
- next if k == :count || k == :codes ## skip "virtual" count entry (added for stats)
115
-
116
- ## skip (sub)country entries e.g. Baker Island, Ile Amsterdam, etc.
117
- next if SKIP_CATEGORIES.include?( k )
118
-
119
- print " " * (level-1) if level > 1 ## add 4 spaces indents per extra level
120
- print "- "
121
-
122
- print "**" if level == 1 ## mark as bold
123
- print k
124
- print "**" if level == 1
125
-
126
- print " _("
127
- print v[:count]
128
- if v[:codes] ## add codes if present
129
- print " - "
130
- print v[:codes]
131
- end
132
- print ")_"
133
-
134
- print "\n"
135
-
136
- walk_categories( v, level+1)
137
- end
138
- end
139
-
140
-
141
-
142
- print_categories( c.data )
143
-
144
- puts "Done."
145
-