factbook 1.1.1 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. checksums.yaml +4 -4
  2. data/{HISTORY.md → CHANGELOG.md} +3 -3
  3. data/Manifest.txt +1 -58
  4. data/README.md +50 -575
  5. data/Rakefile +29 -33
  6. data/lib/factbook.rb +8 -75
  7. metadata +20 -114
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -214
  16. data/lib/factbook/builder_item.rb +0 -92
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -185
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -207
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -102
  34. data/lib/factbook/version.rb +0 -22
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -18
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/au.html +0 -2006
  48. data/test/data/src/be.html +0 -2011
  49. data/test/helper.rb +0 -11
  50. data/test/test_attribs.rb +0 -82
  51. data/test/test_attribs_def.rb +0 -20
  52. data/test/test_builder.rb +0 -35
  53. data/test/test_codes.rb +0 -76
  54. data/test/test_comparisons.rb +0 -19
  55. data/test/test_convert.rb +0 -30
  56. data/test/test_counter.rb +0 -31
  57. data/test/test_fields.rb +0 -52
  58. data/test/test_importer.rb +0 -55
  59. data/test/test_item_builder.rb +0 -99
  60. data/test/test_json.rb +0 -44
  61. data/test/test_json_builder.rb +0 -25
  62. data/test/test_normalize.rb +0 -23
  63. data/test/test_page.rb +0 -38
  64. data/test/test_sanitizer.rb +0 -35
@@ -1,29 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Sect
7
- include LogUtils::Logging
8
-
9
- attr_accessor :title ## use name instead of title - why? why not?
10
- attr_accessor :subsects
11
-
12
- def initialize
13
- @subsects = []
14
- end
15
-
16
- def data
17
- ## convert sects to hash
18
- @data = {}
19
-
20
- subsects.each_with_index do |subsect,i|
21
- @data[ subsect.title ] = subsect.data
22
- end
23
- @data
24
- end
25
-
26
-
27
- end # class Sect
28
-
29
- end # module Factbook
@@ -1,18 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Subsect
7
- include LogUtils::Logging
8
-
9
- attr_accessor :title ## use name instead of title - why? why not?
10
- attr_accessor :data ## hash holding data e.g. { 'text' => '...' etc. }
11
-
12
- def initialize
13
- @data = {}
14
- end
15
-
16
- end # class Subsect
17
-
18
- end # module Factbook
@@ -1,52 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- ##
6
- ## make more "generic" - why? why not?
7
- ## (re)use for other files ?? move to textutils ??
8
-
9
- ##
10
- ## for now reads in rows with values separated by at least 3+ spaces e.g.:
11
- ## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
12
- ## 1 China 1,367,485,388
13
- ## 2 India 1,251,695,584
14
- ## 3 European Union 513,949,445
15
- ## 4 United States 321,368,864
16
- ## 5 Indonesia 255,993,674
17
- ## 6 Brazil 204,259,812
18
-
19
-
20
- class TableReader
21
- include LogUtils::Logging
22
-
23
-
24
- def initialize( text )
25
- @text = text
26
- end
27
-
28
- def read
29
- recs = []
30
-
31
- line_no = 0
32
- @text.each_line do |line|
33
- line_no +=1
34
- line = line.strip ## remove leading and trailing whitespace
35
- if line.empty?
36
- puts "** skipping empty line #{line_no}"
37
- next
38
- end
39
-
40
- values = line.split( /[ ]{3,}/ ) ## split three or more spaces - use just two ?? why? why not??
41
-
42
- ## puts line
43
- ## pp values
44
- recs << values
45
- end
46
- recs
47
- end
48
-
49
-
50
- end # class TableReader
51
-
52
- end # module Factbook
@@ -1,85 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
- module Utils
5
-
6
- ########################################
7
- ## todo: move to textutils - why, why not ?????
8
-
9
- def encode_utf8( text )
10
-
11
- errors = [] ## also return list of encoding errors
12
-
13
- ## note: factbook claims utf-8 - but includes invalid bytes in some pages
14
- ## encoding is likely western/windows-
15
-
16
- ## note:
17
- ## use � - unknown/invalid unicode char
18
- ## fix/todo: use ASCII-8BIT instead of binary
19
- text = text.encode('UTF-8', 'binary', :invalid => :replace,
20
- :undef => :replace,
21
- :replace => '�' )
22
-
23
- ## check for replaced/invalid chars and log warning
24
- pos = text.index( '�' )
25
- while pos
26
- from = pos-10 ## tood/fix: use min/max to check for bounds - why? why not??
27
- to = pos+10
28
- around = text[from..to]
29
- puts " pos #{pos}, from #{from}, to #{to}, around >#{around}<"
30
- msg = "invalid char on pos #{pos} around: >#{around}<"
31
- puts msg
32
- ## also log message / w timestamp
33
-
34
- errors << "#{Time.now} - #{msg}"
35
-
36
- pos = text.index( '�', pos+1 )
37
- end
38
-
39
- [text,errors] ## return text and errors (list)
40
- end
41
-
42
-
43
-
44
- def values_to_csv( values )
45
- buf = ""
46
- values.each_with_index do |value,i|
47
- buf << ',' if i > 0 ## add comma (except for first value)
48
- ## note: allow optional $ sign e.g. $100,000,000
49
- ## !!!! todo/fix: allow optional minus e.g. -44,000
50
- if value =~ /^\$?[1-9][,0-9]+[0-9]$/ ### find a better regex - why? why not??
51
- ## check if number e.g. 17,098,242 or $17,098,242
52
- ## remove commas 17098242
53
- buf << value.gsub( ',', '' )
54
- elsif value.index( ',').nil?
55
- ## add as is 1:1 (no comma)
56
- buf << value
57
- else
58
- ## escape comma with double quote
59
- # e.g. Guam, The becomes "Guam, The"
60
- buf << '"'
61
- buf << value
62
- buf << '"'
63
- end
64
- end
65
- buf
66
- end
67
-
68
-
69
- def data_to_csv( recs, headers )
70
- text = ""
71
-
72
- text << values_to_csv( headers )
73
- text << "\n"
74
-
75
- recs.each do |rec|
76
- text << values_to_csv( rec )
77
- text << "\n"
78
- end
79
-
80
- text
81
- end
82
-
83
-
84
- end # module Utils
85
- end # module Factbook
@@ -1,102 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
- module Utils
5
-
6
- #######
7
- ## find meta data (about page info)
8
-
9
-
10
- #### e.g. Page last updated on September 16, 2015
11
-
12
- MONTH_EN_TO_S={
13
- 'January' => '1',
14
- 'February' => '2',
15
- 'March' => '3',
16
- 'April' => '4',
17
- 'May' => '5',
18
- 'June' => '6',
19
- 'July' => '7',
20
- 'August' => '8',
21
- 'September' => '9',
22
- 'October' => '10',
23
- 'November' => '11',
24
- 'December' => '12'
25
- }
26
-
27
- PAGE_LAST_UPDATED_REGEX = /
28
- Page \s last \s updated \s on \s
29
- (?<month_en>[a-z]+) \s
30
- (?<day>\d{1,2}), \s
31
- (?<year>\d{4})
32
- /imx
33
-
34
- def find_page_last_updated( html )
35
- m = PAGE_LAST_UPDATED_REGEX.match( html )
36
- if m
37
- pp m
38
- month_en = m[:month_en]
39
- day = m[:day]
40
- year = m[:year]
41
- puts "** bingo - month #{month_en}, day #{day}, year #{year}"
42
-
43
- month = MONTH_EN_TO_S[ month_en ]
44
- date_str = "#{year}-#{month}-#{day}"
45
- pp date_str
46
- date = Date.strptime( date_str, '%Y-%m-%d' )
47
- date
48
- else
49
- nil
50
- end
51
- end
52
-
53
- ##
54
- ## e.g. regioncode="eur"
55
- ## countrycode="au"
56
- ## countryname="Austria"
57
- ## flagsubfield=""
58
- ## countryaffiliation=""
59
- ## flagdescription=""
60
- ## flagdescriptionnote=""
61
- ## region="Europe"
62
- ##
63
- ## note: countryaffiliation may be empty
64
-
65
-
66
-
67
- PAGE_INFO_REGEX = /
68
- regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
69
- \s+
70
- countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2> ## is k<3> backref
71
- \s+
72
- countryname=(?<q3>"|')(?<country>.+?)\k<q3>
73
- \s+
74
- [^>]+? ## allow any attribs (note: non-greedy)
75
- countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4> ## note: might be empty
76
- \s+
77
- [^>]+? ## allow any attribs (note: non-greedy)
78
- region=(?<q5>"|')(?<region>.+?)\k<q5> ## check world - might be empty ?? or for ocean ??
79
- /imx
80
-
81
-
82
- def find_page_info( html )
83
- m = PAGE_INFO_REGEX.match( html )
84
- if m
85
- pp m
86
-
87
- h = { country_code: m[:country_code],
88
- country_name: m[:country],
89
- country_affiliation: m[:affiliation],
90
- region_code: m[:region_code],
91
- region_name: m[:region] }
92
-
93
- puts "** bingo - #{h.inspect}"
94
- h ## return hash w/ name-value pairs
95
- else
96
- nil ## or return empty struct with nils/empty strings - why?? why not??
97
- end
98
- end
99
-
100
-
101
- end # module Utils
102
- end # module Factbook
@@ -1,22 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- MAJOR = 1
6
- MINOR = 1
7
- PATCH = 1
8
- VERSION = [MAJOR,MINOR,PATCH].join('.')
9
-
10
- def self.version
11
- VERSION
12
- end
13
-
14
- def self.banner
15
- "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
16
- end
17
-
18
- def self.root
19
- "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}"
20
- end
21
-
22
- end
@@ -1,48 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/almanac.rb
5
-
6
-
7
- require 'factbook'
8
-
9
-
10
- TEMPLATE = <<EOS
11
-
12
- ### <%= names %>
13
-
14
- <%= page.name_long=='none' ? '\-' : page.name_long %> › <%= page.map %> -- <%= page.location %> <br>
15
- <%= page.capital %> • <%= page.area %> • pop. <%= page.population %>
16
-
17
- **Languages:** <%= page.languages %>
18
- **Major cities:** <%= page.major_cities %>
19
- **Ethnic groups:** <%= page.ethnic_groups %>
20
- **Religions:** <%= page.religions %>
21
- **Independence:** <%= page.independence %>
22
-
23
- **Internet:** `<%= page.internet %>` • <%= page.internet_users %> • <%= page.internet_users_rate %>
24
- **Telephones - mobile:** <%= page.telephones_mobile %> • <%= page.telephones_mobile_subscriptions %> subs./100
25
-
26
- EOS
27
-
28
-
29
- #########################
30
- ### read all countries
31
- ### using local json (dump) files
32
-
33
- ## see github.com/factbook/factbook.json (use git clone)
34
- json_dir = '../../factbook/factbook.json'
35
- codes = Factbook.codes.countries
36
- ## todo: add Taiwan and ?? from others - why, why not??
37
-
38
- pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )
39
-
40
- almanac = Factbook::Almanac.new( pages )
41
-
42
- ## save to disk
43
-
44
- File.open( './tmp/ALMANAC.md', 'w' ) do |f|
45
- f.write almanac.render( TEMPLATE )
46
- end
47
-
48
- puts "Done."
@@ -1,34 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/attributes.rb
5
-
6
-
7
- # e.g. prints attribute accessor shortcuts
8
- #
9
- # ### Geography
10
- #
11
- # - `location` => Location
12
- # - `coords` => Geographic coordinates
13
- # - `map` => Map references
14
- # ...
15
-
16
- require 'factbook'
17
-
18
-
19
- attribs= Factbook.attributes.to_a
20
-
21
- h = attribs.group_by { |a| a.category }
22
-
23
- pp h
24
-
25
- h.each do |k,v|
26
- puts ""
27
- puts "### #{k}"
28
- puts ""
29
-
30
- v.each do |a|
31
- puts "- `#{a.name}` => #{a.path.join(' › ')}"
32
- end
33
- end
34
-
@@ -1,28 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run/test:
4
- # ruby -I ./lib script/build.rb
5
-
6
- require 'factbook'
7
-
8
- DB_CONFIG = {
9
- adapter: 'sqlite3',
10
- database: './factbook.db'
11
- }
12
-
13
- ActiveRecord::Base.logger = Logger.new( STDOUT )
14
- ActiveRecord::Base.establish_connection( DB_CONFIG )
15
-
16
- Factbook::CreateDb.new.up ## create tables
17
-
18
- importer = Factbook::Importer.new
19
-
20
- Factbook.codes.each do |code|
21
- puts "Fetching #{code.code}- #{code.name}..."
22
- page = Factbook::Page.new( code.code )
23
-
24
- puts "Adding #{code.code}- #{code.name}..."
25
- importer.import( page )
26
- end
27
-
28
- puts "Done."
@@ -1,145 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/counter.rb
5
-
6
- require 'factbook'
7
-
8
-
9
- c = Factbook::Counter.new
10
-
11
- ## see github.com/factbook/factbook.json (use git clone)
12
- json_dir = '../../factbook/factbook.json'
13
- codes = Factbook.codes
14
-
15
- pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )
16
-
17
- pages.each do |page|
18
- c.count( page )
19
- end
20
-
21
- h = c.data
22
- pp h
23
-
24
- ### save to json
25
- puts "saving a copy to categories.json for debugging"
26
- File.open( "tmp/categories.json", 'w' ) do |f|
27
- f.write JSON.pretty_generate( h )
28
- end
29
-
30
-
31
-
32
- SKIP_CATEGORIES_LINES=<<EOS
33
-
34
- ######
35
- ### france plus 5 overseas regions/departments
36
-
37
- ## metropolitan France
38
- ## metropolitan France - total
39
- overseas departments
40
- French Guiana
41
- French Guiana - total
42
- Guadeloupe
43
- Guadeloupe and Martinique
44
- Martinique
45
- Mayotte
46
- Reunion
47
-
48
-
49
- ###############
50
- ### more
51
-
52
- Iles Eparses
53
- Ile Amsterdam
54
- Ile Amsterdam (Ile Amsterdam et Ile Saint-Paul)
55
- Ile Amsterdam et Ile Saint-Paul
56
- Ile Saint Paul
57
- Ile Saint-Paul (Ile Amsterdam et Ile Saint-Paul)
58
- Iles Crozet
59
- Iles Kerguelen
60
- Adelie Land
61
- Bassas da India
62
- Bassas da India (Iles Eparses)
63
- Bassas da India, Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
64
- Europa Island
65
- Europa Island (Iles Eparses)
66
- Europa Island, Glorioso Islands, Juan de Nova Island
67
- Europa Island and Juan de Nova Island (Iles Eparses)
68
- Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
69
- Glorioso Islands
70
- Glorioso Islands (Iles Eparses)
71
- Glorioso Island (Iles Eparses)
72
- Juan de Nova Island
73
- Juan de Nova Island (Iles Eparses)
74
- Tromelin Island
75
- Tromelin Island (Iles Eparses)
76
- Saint Helena
77
- Ascension Island
78
- Ascension
79
- Tristan da Cunha
80
- Tristan da Cunha island group
81
- Baker Island
82
- Baker, Howland, and Jarvis Islands
83
- Baker, Howland, and Jarvis Islands, and Johnston Atoll
84
- Baker, Howland, and Jarvis Islands, and Kingman Reef
85
- Howland Island
86
- Jarvis Island
87
- Johnston Atoll
88
- Johnston Atoll and Kingman Reef
89
- Kingman Reef
90
- Midway Islands
91
- Midway Islands, Johnston, and Palmyra Atolls
92
- Midway Islands and Palmyra Atoll
93
- Palmyra Atoll
94
- note on Palmyra Atoll
95
- EOS
96
-
97
- ## allow empty lines and skip comments
98
- SKIP_CATEGORIES = SKIP_CATEGORIES_LINES.split("\n").select { |item| !(item =~ /^\s*$/ || item =~ /^\s*#/) }
99
-
100
-
101
- def print_categories( data )
102
- data.each do |k,v|
103
-
104
- puts ""
105
- puts "## #{k} _(#{v[:count]})_"
106
- puts ""
107
-
108
- walk_categories( v, 1 )
109
- end
110
- end
111
-
112
- def walk_categories( data, level )
113
- data.each do |k,v|
114
- next if k == :count || k == :codes ## skip "virtual" count entry (added for stats)
115
-
116
- ## skip (sub)country entries e.g. Baker Island, Ile Amsterdam, etc.
117
- next if SKIP_CATEGORIES.include?( k )
118
-
119
- print " " * (level-1) if level > 1 ## add 4 spaces indents per extra level
120
- print "- "
121
-
122
- print "**" if level == 1 ## mark as bold
123
- print k
124
- print "**" if level == 1
125
-
126
- print " _("
127
- print v[:count]
128
- if v[:codes] ## add codes if present
129
- print " - "
130
- print v[:codes]
131
- end
132
- print ")_"
133
-
134
- print "\n"
135
-
136
- walk_categories( v, level+1)
137
- end
138
- end
139
-
140
-
141
-
142
- print_categories( c.data )
143
-
144
- puts "Done."
145
-