factbook 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -1
  3. data/Manifest.txt +0 -61
  4. data/README.md +8 -506
  5. data/Rakefile +4 -9
  6. data/lib/factbook.rb +4 -64
  7. metadata +6 -124
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -212
  16. data/lib/factbook/builder_item.rb +0 -126
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -148
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -178
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -129
  34. data/lib/factbook/version.rb +0 -21
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -19
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/ag.html +0 -716
  48. data/test/data/src/au-2015-09-24.html +0 -2006
  49. data/test/data/src/au.html +0 -658
  50. data/test/data/src/be-2015-09-24.html +0 -2011
  51. data/test/data/src/be.html +0 -648
  52. data/test/helper.rb +0 -11
  53. data/test/test_attribs.rb +0 -87
  54. data/test/test_attribs_def.rb +0 -20
  55. data/test/test_builder.rb +0 -35
  56. data/test/test_codes.rb +0 -76
  57. data/test/test_comparisons.rb +0 -19
  58. data/test/test_convert.rb +0 -30
  59. data/test/test_counter.rb +0 -31
  60. data/test/test_fields.rb +0 -52
  61. data/test/test_importer.rb +0 -56
  62. data/test/test_item_builder.rb +0 -99
  63. data/test/test_json.rb +0 -45
  64. data/test/test_json_builder.rb +0 -25
  65. data/test/test_normalize.rb +0 -23
  66. data/test/test_page.rb +0 -38
  67. data/test/test_sanitizer.rb +0 -39
  68. data/test/test_sanitizer_regex.rb +0 -89
@@ -1,85 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
- module Utils
5
-
6
- ########################################
7
- ## todo: move to textutils - why, why not ?????
8
-
9
- def encode_utf8( text )
10
-
11
- errors = [] ## also return list of encoding errors
12
-
13
- ## note: factbook claims utf-8 - but includes invalid bytes in some pages
14
- ## encoding is likely western/windows-
15
-
16
- ## note:
17
- ## use � - unknown/invalid unicode char
18
- ## fix/todo: use ASCII-8BIT instead of binary
19
- text = text.encode('UTF-8', 'binary', :invalid => :replace,
20
- :undef => :replace,
21
- :replace => '�' )
22
-
23
- ## check for replaced/invalid chars and log warning
24
- pos = text.index( '�' )
25
- while pos
26
- from = pos-10 ## todo/fix: use min/max to check for bounds - why? why not??
27
- to = pos+10
28
- around = text[from..to]
29
- puts " pos #{pos}, from #{from}, to #{to}, around >#{around}<"
30
- msg = "invalid char on pos #{pos} around: >#{around}<"
31
- puts msg
32
- ## also log message / w timestamp
33
-
34
- errors << "#{Time.now} - #{msg}"
35
-
36
- pos = text.index( '�', pos+1 )
37
- end
38
-
39
- [text,errors] ## return text and errors (list)
40
- end
41
-
42
-
43
-
44
- def values_to_csv( values )
45
- buf = ""
46
- values.each_with_index do |value,i|
47
- buf << ',' if i > 0 ## add comma (except for first value)
48
- ## note: allow optional $ sign e.g. $100,000,000
49
- ## !!!! todo/fix: allow optional minus e.g. -44,000
50
- if value =~ /^\$?[1-9][,0-9]+[0-9]$/ ### find a better regex - why? why not??
51
- ## check if number e.g. 17,098,242 or $17,098,242
52
- ## remove commas 17098242
53
- buf << value.gsub( ',', '' )
54
- elsif value.index( ',').nil?
55
- ## add as is 1:1 (no comma)
56
- buf << value
57
- else
58
- ## escape comma with double quote
59
- # e.g. Guam, The becomes "Guam, The"
60
- buf << '"'
61
- buf << value
62
- buf << '"'
63
- end
64
- end
65
- buf
66
- end
67
-
68
-
69
- def data_to_csv( recs, headers )
70
- text = ""
71
-
72
- text << values_to_csv( headers )
73
- text << "\n"
74
-
75
- recs.each do |rec|
76
- text << values_to_csv( rec )
77
- text << "\n"
78
- end
79
-
80
- text
81
- end
82
-
83
-
84
- end # module Utils
85
- end # module Factbook
@@ -1,129 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
- module Utils
5
-
6
- #######
7
- ## find meta data (about page info)
8
-
9
-
10
- #### e.g. Page last updated on September 16, 2015
11
-
12
- MONTH_EN_TO_S={
13
- 'January' => '1',
14
- 'February' => '2',
15
- 'March' => '3',
16
- 'April' => '4',
17
- 'May' => '5',
18
- 'June' => '6',
19
- 'July' => '7',
20
- 'August' => '8',
21
- 'September' => '9',
22
- 'October' => '10',
23
- 'November' => '11',
24
- 'December' => '12'
25
- }
26
-
27
-
28
-
29
- ##
30
- # examples (to match):
31
- # Page last updated on November 03, 2016
32
- # Page last updated on September 24, 2015
33
-
34
- PAGE_LAST_UPDATED_REGEX = /
35
- Page \s last \s updated \s on \s
36
- (?<month_en>[a-z]+) \s
37
- (?<day>\d{1,2}), \s
38
- (?<year>\d{4})
39
- /imx
40
-
41
- def find_page_last_updated( html )
42
- m = PAGE_LAST_UPDATED_REGEX.match( html )
43
- if m
44
- pp m
45
- month_en = m[:month_en]
46
- day = m[:day]
47
- year = m[:year]
48
- puts "** bingo - month #{month_en}, day #{day}, year #{year}"
49
-
50
- month = MONTH_EN_TO_S[ month_en ]
51
- date_str = "#{year}-#{month}-#{day}"
52
- pp date_str
53
- date = Date.strptime( date_str, '%Y-%m-%d' )
54
- date
55
- else
56
- nil
57
- end
58
- end
59
-
60
-
61
- ## fallback: find "standalone" country code
62
- ## e.g.
63
- ## ccode='au'
64
-
65
- COUNTRY_CODE_REGEX = /ccode='(?<cc>[a-z]+)'/
66
-
67
- def find_country_code( html )
68
- m = COUNTRY_CODE_REGEX.match( html )
69
- if m
70
- pp m
71
- cc = m[:cc]
72
- puts "** bingo - country code #{cc}"
73
- cc
74
- else
75
- nil
76
- end
77
- end
78
-
79
-
80
- ##
81
- ## e.g. regioncode="eur"
82
- ## countrycode="au"
83
- ## countryname="Austria"
84
- ## flagsubfield=""
85
- ## countryaffiliation=""
86
- ## flagdescription=""
87
- ## flagdescriptionnote=""
88
- ## region="Europe"
89
- ##
90
- ## note: countryaffiliation may be empty
91
-
92
-
93
-
94
- PAGE_INFO_REGEX = /
95
- regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
96
- \s+
97
- countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2> ## is k<3> backref
98
- \s+
99
- countryname=(?<q3>"|')(?<country>.+?)\k<q3>
100
- \s+
101
- [^>]+? ## allow any attribs (note: non-greedy)
102
- countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4> ## note: might be empty
103
- \s+
104
- [^>]+? ## allow any attribs (note: non-greedy)
105
- region=(?<q5>"|')(?<region>.+?)\k<q5> ## check world - might be empty ?? or for ocean ??
106
- /imx
107
-
108
-
109
- def find_page_info( html )
110
- m = PAGE_INFO_REGEX.match( html )
111
- if m
112
- pp m
113
-
114
- h = { country_code: m[:country_code],
115
- country_name: m[:country],
116
- country_affiliation: m[:affiliation],
117
- region_code: m[:region_code],
118
- region_name: m[:region] }
119
-
120
- puts "** bingo - #{h.inspect}"
121
- h ## return hash w/ name-value pairs
122
- else
123
- nil ## or return empty struct with nils/empty strings - why?? why not??
124
- end
125
- end
126
-
127
-
128
- end # module Utils
129
- end # module Factbook
@@ -1,21 +0,0 @@
1
-
2
- module Factbook
3
-
4
- MAJOR = 2
5
- MINOR = 0
6
- PATCH = 0
7
- VERSION = [MAJOR,MINOR,PATCH].join('.')
8
-
9
- def self.version
10
- VERSION
11
- end
12
-
13
- def self.banner
14
- "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
15
- end
16
-
17
- def self.root
18
- File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
19
- end
20
-
21
- end
@@ -1,48 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/almanac.rb
5
-
6
-
7
- require 'factbook'
8
-
9
-
10
- TEMPLATE = <<EOS
11
-
12
- ### <%= names %>
13
-
14
- <%= page.name_long=='none' ? '\-' : page.name_long %> › <%= page.map %> -- <%= page.location %> <br>
15
- <%= page.capital %> • <%= page.area %> • pop. <%= page.population %>
16
-
17
- **Languages:** <%= page.languages %>
18
- **Major cities:** <%= page.major_cities %>
19
- **Ethnic groups:** <%= page.ethnic_groups %>
20
- **Religions:** <%= page.religions %>
21
- **Independence:** <%= page.independence %>
22
-
23
- **Internet:** `<%= page.internet %>` • <%= page.internet_users %> • <%= page.internet_users_rate %>
24
- **Telephones - mobile:** <%= page.telephones_mobile %> • <%= page.telephones_mobile_subscriptions %> subs./100
25
-
26
- EOS
27
-
28
-
29
- #########################
30
- ### read all countries
31
- ### using local json (dump) files
32
-
33
- ## see github.com/factbook/factbook.json (use git clone)
34
- json_dir = '../../factbook/factbook.json'
35
- codes = Factbook.codes.countries
36
- ## todo: add Taiwan and ?? from others - why, why not??
37
-
38
- pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )
39
-
40
- almanac = Factbook::Almanac.new( pages )
41
-
42
- ## save to disk
43
-
44
- File.open( './tmp/ALMANAC.md', 'w' ) do |f|
45
- f.write almanac.render( TEMPLATE )
46
- end
47
-
48
- puts "Done."
@@ -1,34 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/attributes.rb
5
-
6
-
7
- # e.g. prints attribute accessor shortcuts
8
- #
9
- # ### Geography
10
- #
11
- # - `location` => Location
12
- # - `coords` => Geographic coordinates
13
- # - `map` => Map references
14
- # ...
15
-
16
- require 'factbook'
17
-
18
-
19
- attribs= Factbook.attributes.to_a
20
-
21
- h = attribs.group_by { |a| a.category }
22
-
23
- pp h
24
-
25
- h.each do |k,v|
26
- puts ""
27
- puts "### #{k}"
28
- puts ""
29
-
30
- v.each do |a|
31
- puts "- `#{a.name}` => #{a.path.join(' › ')}"
32
- end
33
- end
34
-
@@ -1,28 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run/test:
4
- # ruby -I ./lib script/build.rb
5
-
6
- require 'factbook'
7
-
8
- DB_CONFIG = {
9
- adapter: 'sqlite3',
10
- database: './factbook.db'
11
- }
12
-
13
- ActiveRecord::Base.logger = Logger.new( STDOUT )
14
- ActiveRecord::Base.establish_connection( DB_CONFIG )
15
-
16
- Factbook::CreateDb.new.up ## create tables
17
-
18
- importer = Factbook::Importer.new
19
-
20
- Factbook.codes.each do |code|
21
- puts "Fetching #{code.code}- #{code.name}..."
22
- page = Factbook::Page.new( code.code )
23
-
24
- puts "Adding #{code.code}- #{code.name}..."
25
- importer.import( page )
26
- end
27
-
28
- puts "Done."
@@ -1,145 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/counter.rb
5
-
6
- require 'factbook'
7
-
8
-
9
- c = Factbook::Counter.new
10
-
11
- ## see github.com/factbook/factbook.json (use git clone)
12
- json_dir = '../../factbook/factbook.json'
13
- codes = Factbook.codes
14
-
15
- pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )
16
-
17
- pages.each do |page|
18
- c.count( page )
19
- end
20
-
21
- h = c.data
22
- pp h
23
-
24
- ### save to json
25
- puts "saving a copy to categories.json for debugging"
26
- File.open( "tmp/categories.json", 'w' ) do |f|
27
- f.write JSON.pretty_generate( h )
28
- end
29
-
30
-
31
-
32
- SKIP_CATEGORIES_LINES=<<EOS
33
-
34
- ######
35
- ### france plus 5 overseas regions/departments
36
-
37
- ## metropolitan France
38
- ## metropolitan France - total
39
- overseas departments
40
- French Guiana
41
- French Guiana - total
42
- Guadeloupe
43
- Guadeloupe and Martinique
44
- Martinique
45
- Mayotte
46
- Reunion
47
-
48
-
49
- ###############
50
- ### more
51
-
52
- Iles Eparses
53
- Ile Amsterdam
54
- Ile Amsterdam (Ile Amsterdam et Ile Saint-Paul)
55
- Ile Amsterdam et Ile Saint-Paul
56
- Ile Saint Paul
57
- Ile Saint-Paul (Ile Amsterdam et Ile Saint-Paul)
58
- Iles Crozet
59
- Iles Kerguelen
60
- Adelie Land
61
- Bassas da India
62
- Bassas da India (Iles Eparses)
63
- Bassas da India, Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
64
- Europa Island
65
- Europa Island (Iles Eparses)
66
- Europa Island, Glorioso Islands, Juan de Nova Island
67
- Europa Island and Juan de Nova Island (Iles Eparses)
68
- Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
69
- Glorioso Islands
70
- Glorioso Islands (Iles Eparses)
71
- Glorioso Island (Iles Eparses)
72
- Juan de Nova Island
73
- Juan de Nova Island (Iles Eparses)
74
- Tromelin Island
75
- Tromelin Island (Iles Eparses)
76
- Saint Helena
77
- Ascension Island
78
- Ascension
79
- Tristan da Cunha
80
- Tristan da Cunha island group
81
- Baker Island
82
- Baker, Howland, and Jarvis Islands
83
- Baker, Howland, and Jarvis Islands, and Johnston Atoll
84
- Baker, Howland, and Jarvis Islands, and Kingman Reef
85
- Howland Island
86
- Jarvis Island
87
- Johnston Atoll
88
- Johnston Atoll and Kingman Reef
89
- Kingman Reef
90
- Midway Islands
91
- Midway Islands, Johnston, and Palmyra Atolls
92
- Midway Islands and Palmyra Atoll
93
- Palmyra Atoll
94
- note on Palmyra Atoll
95
- EOS
96
-
97
- ## allow empty lines and skip comments
98
- SKIP_CATEGORIES = SKIP_CATEGORIES_LINES.split("\n").select { |item| !(item =~ /^\s*$/ || item =~ /^\s*#/) }
99
-
100
-
101
- def print_categories( data )
102
- data.each do |k,v|
103
-
104
- puts ""
105
- puts "## #{k} _(#{v[:count]})_"
106
- puts ""
107
-
108
- walk_categories( v, 1 )
109
- end
110
- end
111
-
112
- def walk_categories( data, level )
113
- data.each do |k,v|
114
- next if k == :count || k == :codes ## skip "virtual" count entry (added for stats)
115
-
116
- ## skip (sub)country entries e.g. Baker Island, Ile Amsterdam, etc.
117
- next if SKIP_CATEGORIES.include?( k )
118
-
119
- print " " * (level-1) if level > 1 ## add 4 spaces indents per extra level
120
- print "- "
121
-
122
- print "**" if level == 1 ## mark as bold
123
- print k
124
- print "**" if level == 1
125
-
126
- print " _("
127
- print v[:count]
128
- if v[:codes] ## add codes if present
129
- print " - "
130
- print v[:codes]
131
- end
132
- print ")_"
133
-
134
- print "\n"
135
-
136
- walk_categories( v, level+1)
137
- end
138
- end
139
-
140
-
141
-
142
- print_categories( c.data )
143
-
144
- puts "Done."
145
-