factbook 2.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -1
  3. data/Manifest.txt +0 -61
  4. data/README.md +8 -506
  5. data/Rakefile +4 -9
  6. data/lib/factbook.rb +4 -64
  7. metadata +6 -124
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -212
  16. data/lib/factbook/builder_item.rb +0 -126
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -148
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -178
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -129
  34. data/lib/factbook/version.rb +0 -21
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -19
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/ag.html +0 -716
  48. data/test/data/src/au-2015-09-24.html +0 -2006
  49. data/test/data/src/au.html +0 -658
  50. data/test/data/src/be-2015-09-24.html +0 -2011
  51. data/test/data/src/be.html +0 -648
  52. data/test/helper.rb +0 -11
  53. data/test/test_attribs.rb +0 -87
  54. data/test/test_attribs_def.rb +0 -20
  55. data/test/test_builder.rb +0 -35
  56. data/test/test_codes.rb +0 -76
  57. data/test/test_comparisons.rb +0 -19
  58. data/test/test_convert.rb +0 -30
  59. data/test/test_counter.rb +0 -31
  60. data/test/test_fields.rb +0 -52
  61. data/test/test_importer.rb +0 -56
  62. data/test/test_item_builder.rb +0 -99
  63. data/test/test_json.rb +0 -45
  64. data/test/test_json_builder.rb +0 -25
  65. data/test/test_normalize.rb +0 -23
  66. data/test/test_page.rb +0 -38
  67. data/test/test_sanitizer.rb +0 -39
  68. data/test/test_sanitizer_regex.rb +0 -89
@@ -1,85 +0,0 @@
1
- # encoding: utf-8
2
-
3
module Factbook
  ## Text helpers: UTF-8 clean-up of downloaded pages and minimal CSV output.
  module Utils

    ########################################
    ## todo: move to textutils - why, why not ?????

    ## Re-tags +text+ as UTF-8 and replaces every invalid byte sequence
    ## with U+FFFD (the unicode replacement character).
    ##
    ## note: factbook claims utf-8 - but some pages include invalid bytes
    ##   (the real encoding is likely western/windows).
    ##
    ## Valid multi-byte UTF-8 characters are kept as-is; transcoding from
    ## 'binary' (as done before) replaced every byte >= 0x80, valid or not.
    ##
    ## @param text [String] raw page text (any encoding tag; not modified)
    ## @return [Array(String, Array<String>)] the cleaned text and a list of
    ##   timestamped messages, one per replacement character found
    def encode_utf8( text )
      marker = "\uFFFD"   ## unknown/invalid unicode char
      errors = []         ## also return list of encoding errors

      text = text.dup.force_encoding( Encoding::UTF_8 ).scrub( marker )

      ## check for replaced/invalid chars and log a warning for each
      pos = text.index( marker )
      while pos
        ## clamp at 0 - a negative start would slice from the END of the text
        from   = [pos - 10, 0].max
        to     = pos + 10     ## a range end past the last char is fine
        around = text[from..to]
        puts " pos #{pos}, from #{from}, to #{to}, around >#{around}<"
        msg = "invalid char on pos #{pos} around: >#{around}<"
        puts msg

        ## also log message / w timestamp
        errors << "#{Time.now} - #{msg}"

        pos = text.index( marker, pos + 1 )
      end

      [text, errors]   ## return text and errors (list)
    end


    ## Joins +values+ into a single CSV line (without trailing newline).
    ##
    ## - numbers with thousands separators lose their commas, e.g.
    ##     17,098,242 => 17098242, $100,000,000 => $100000000, -44,000 => -44000
    ## - values holding a comma, double quote or line break get wrapped in
    ##   double quotes (embedded quotes are doubled), e.g.
    ##     Guam, The => "Guam, The"
    ## - everything else is added as is (1:1); nil becomes an empty field
    ##
    ## @param values [Array<#to_s>] field values of one record
    ## @return [String] the CSV line
    def values_to_csv( values )
      fields = values.map do |value|
        value = value.to_s
        ## \A..\z (not ^..$) - the whole value must be a number, not just one line
        if value =~ /\A-?\$?[1-9][,0-9]+[0-9]\z/   ### find a better regex - why? why not??
          value.delete( ',' )
        elsif value =~ /[",\r\n]/
          '"' + value.gsub( '"', '""' ) + '"'
        else
          value
        end
      end
      fields.join( ',' )
    end


    ## Builds a CSV document: the header line followed by one line per record.
    ## Every line (including the last) ends with a newline.
    ##
    ## @param recs [#each] records; each record is an array of field values
    ## @param headers [Array<#to_s>] column names
    ## @return [String] the CSV text
    def data_to_csv( recs, headers )
      lines = [values_to_csv( headers )]
      recs.each { |rec| lines << values_to_csv( rec ) }
      lines.map { |line| "#{line}\n" }.join
    end

  end # module Utils
end # module Factbook
@@ -1,129 +0,0 @@
1
- # encoding: utf-8
2
-
3
module Factbook
  ## Helpers that pull meta data (about page info) out of the raw page html.
  module Utils

    ## english month name => month number (as string)
    MONTH_EN_TO_S = {
      'January'   => '1',
      'February'  => '2',
      'March'     => '3',
      'April'     => '4',
      'May'       => '5',
      'June'      => '6',
      'July'      => '7',
      'August'    => '8',
      'September' => '9',
      'October'   => '10',
      'November'  => '11',
      'December'  => '12'
    }.freeze


    ##
    # examples (to match):
    #   Page last updated on November 03, 2016
    #   Page last updated on September 24, 2015
    #
    # note: matches case-insensitive and allows runs of whitespace
    #   (incl. line breaks) between the words

    PAGE_LAST_UPDATED_REGEX = /
        Page \s+ last \s+ updated \s+ on \s+
           (?<month_en>[a-z]+) \s+
           (?<day>\d{1,2}), \s+
           (?<year>\d{4})
      /imx

    ## Finds the "Page last updated on <month> <day>, <year>" stamp.
    ##
    ## @param html [String] page source
    ## @return [Date, nil] the date, or nil if there is no stamp or the
    ##   month name is unknown
    ## @raise [ArgumentError] (from Date.strptime) if day and month do not
    ##   form a valid date
    def find_page_last_updated( html )
      m = PAGE_LAST_UPDATED_REGEX.match( html )
      return nil unless m

      pp m
      month_en = m[:month_en]
      day      = m[:day]
      year     = m[:year]
      puts "** bingo - month #{month_en}, day #{day}, year #{year}"

      ## the regex ignores case but the lookup keys are capitalized -
      ##   normalize first (e.g. SEPTEMBER => September)
      month = MONTH_EN_TO_S[ month_en.capitalize ]
      return nil unless month    ## not a month name e.g. Sept

      date_str = "#{year}-#{month}-#{day}"
      pp date_str
      Date.strptime( date_str, '%Y-%m-%d' )
    end


    ## fallback: find "standalone" country code
    ##   e.g.
    ##    ccode='au'

    COUNTRY_CODE_REGEX = /ccode='(?<cc>[a-z]+)'/

    ## Finds the standalone two-letter style country code marker.
    ##
    ## @param html [String] page source
    ## @return [String, nil] the code (e.g. "au") or nil if not found
    def find_country_code( html )
      m = COUNTRY_CODE_REGEX.match( html )
      return nil unless m

      pp m
      cc = m[:cc]
      puts "** bingo - country code #{cc}"
      cc
    end


    ##
    ## e.g. regioncode="eur"
    ##      countrycode="au"
    ##      countryname="Austria"
    ##      flagsubfield=""
    ##      countryaffiliation=""
    ##      flagdescription=""
    ##      flagdescriptionnote=""
    ##      region="Europe"
    ##
    ## note: countryaffiliation may be empty

    PAGE_INFO_REGEX = /
        regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
          \s+
        countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2>   ## closing quote is a backref to the opening quote
          \s+
        countryname=(?<q3>"|')(?<country>.+?)\k<q3>
          \s+
          [^>]+?     ## allow any attribs (note: non-greedy)
        countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4>   ## note: might be empty
          \s+
          [^>]+?     ## allow any attribs (note: non-greedy)
        region=(?<q5>"|')(?<region>.+?)\k<q5>    ## check world - might be empty ?? or for ocean ??
      /imx

    ## Finds the country and region attributes (see example above).
    ##
    ## @param html [String] page source
    ## @return [Hash, nil] hash with the keys :country_code, :country_name,
    ##   :country_affiliation, :region_code, :region_name - or nil if the
    ##   attributes are not found
    def find_page_info( html )
      m = PAGE_INFO_REGEX.match( html )
      return nil unless m   ## or return empty struct with nils/empty strings - why?? why not??

      pp m

      h = { country_code:        m[:country_code],
            country_name:        m[:country],
            country_affiliation: m[:affiliation],
            region_code:         m[:region_code],
            region_name:         m[:region] }

      puts "** bingo - #{h.inspect}"
      h    ## return hash w/ name-value pairs
    end

  end # module Utils
end # module Factbook
@@ -1,21 +0,0 @@
1
-
2
## Version and location info for the factbook library.
module Factbook

  MAJOR = 2
  MINOR = 0
  PATCH = 0
  ## frozen - a shared constant string must not be changed in place
  VERSION = [MAJOR, MINOR, PATCH].join( '.' ).freeze

  ## @return [String] the library version e.g. "2.0.0"
  def self.version
    VERSION
  end

  ## @return [String] one-line banner with library and ruby version info
  def self.banner
    "factbook/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
  end

  ## @return [String] absolute path three directory levels up from this file
  def self.root
    File.expand_path( File.dirname( File.dirname( File.dirname( __FILE__ ) ) ) )
  end

end
@@ -1,48 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/almanac.rb
5
-
6
-
7
require 'fileutils'
require 'factbook'


## ERB template rendered once per page.
## NOTE(review): this is an interpolating heredoc, so '\-' below reaches ERB
##   as '-' (the backslash is dropped) - confirm that is intended.
TEMPLATE = <<EOS

### <%= names %>

<%= page.name_long=='none' ? '\-' : page.name_long %> › <%= page.map %> -- <%= page.location %> <br>
<%= page.capital %> • <%= page.area %> • pop. <%= page.population %>

**Languages:** <%= page.languages %>
**Major cities:** <%= page.major_cities %>
**Ethnic groups:** <%= page.ethnic_groups %>
**Religions:** <%= page.religions %>
**Independence:** <%= page.independence %>

**Internet:** `<%= page.internet %>` • <%= page.internet_users %> • <%= page.internet_users_rate %>
**Telephones - mobile:** <%= page.telephones_mobile %> • <%= page.telephones_mobile_subscriptions %> subs./100

EOS


#########################
### read all countries
###   using local json (dump) files

## see github.com/factbook/factbook.json (use git clone)
json_dir = '../../factbook/factbook.json'
codes    = Factbook.codes.countries
## todo: add taiwan and ?? from others - why, why not??

pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )

almanac = Factbook::Almanac.new( pages )

## save to disk
##  note: create the output folder first - opening the file for writing
##    fails (Errno::ENOENT) if ./tmp is missing
out_path = './tmp/ALMANAC.md'
FileUtils.mkdir_p( File.dirname( out_path ) )
File.open( out_path, 'w' ) do |f|
  f.write almanac.render( TEMPLATE )
end

puts "Done."
@@ -1,34 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/attributes.rb
5
-
6
-
7
- # e.g. prints attribute accessor shortcuts
8
- #
9
- # ### Geography
10
- #
11
- # - `location` => Location
12
- # - `coords` => Geographic coordinates
13
- # - `map` => Map references
14
- # ...
15
-
16
require 'factbook'


## all attribute definitions, bucketed by their category
by_category = Factbook.attributes.to_a.group_by( &:category )

pp by_category

## one markdown section per category with one bullet per attribute:
##   accessor name => full path
by_category.each do |category, members|
  puts "", "### #{category}", ""

  members.each do |attrib|
    puts "- `#{attrib.name}` => #{attrib.path.join(' › ')}"
  end
end
34
-
@@ -1,28 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run/test:
4
- # ruby -I ./lib script/build.rb
5
-
6
require 'factbook'

## connection settings for the local sqlite database file
DB_CONFIG = {
  adapter:  'sqlite3',
  database: './factbook.db'
}

## log to the console and connect
ActiveRecord::Base.logger = Logger.new( STDOUT )
ActiveRecord::Base.establish_connection( DB_CONFIG )

## create tables
Factbook::CreateDb.new.up

importer = Factbook::Importer.new

## build a page for every code and hand it to the importer
Factbook.codes.each do |entry|
  label = "#{entry.code}- #{entry.name}"

  puts "Fetching #{label}..."
  page = Factbook::Page.new( entry.code )

  puts "Adding #{label}..."
  importer.import( page )
end

puts "Done."
@@ -1,145 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # use to run:
4
- # ruby -I ./lib script/counter.rb
5
-
6
require 'json'
require 'fileutils'
require 'factbook'


c = Factbook::Counter.new

## see github.com/factbook/factbook.json (use git clone)
json_dir = '../../factbook/factbook.json'
codes    = Factbook.codes

pages = Factbook::JsonPageReader.new( json_dir ).read_pages( codes )

## feed every page into the counter
pages.each do |page|
  c.count( page )
end

h = c.data
pp h

### save to json
##  note: create the output folder first - opening the file for writing
##    fails (Errno::ENOENT) if tmp is missing
puts "saving a copy to categories.json for debugging"
FileUtils.mkdir_p( 'tmp' )
File.open( "tmp/categories.json", 'w' ) do |f|
  f.write JSON.pretty_generate( h )
end
29
-
30
-
31
-
32
## Category names to leave out of the printed outline - one name per line;
##  empty lines and lines starting with # are ignored.
SKIP_CATEGORIES_LINES=<<EOS

######
### france plus 5 overseas regions/departments

## metropolitan France
## metropolitan France - total
overseas departments
French Guiana
French Guiana - total
Guadeloupe
Guadeloupe and Martinique
Martinique
Mayotte
Reunion


###############
### more

Iles Eparses
Ile Amsterdam
Ile Amsterdam (Ile Amsterdam et Ile Saint-Paul)
Ile Amsterdam et Ile Saint-Paul
Ile Saint Paul
Ile Saint-Paul (Ile Amsterdam et Ile Saint-Paul)
Iles Crozet
Iles Kerguelen
Adelie Land
Bassas da India
Bassas da India (Iles Eparses)
Bassas da India, Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
Europa Island
Europa Island (Iles Eparses)
Europa Island, Glorioso Islands, Juan de Nova Island
Europa Island and Juan de Nova Island (Iles Eparses)
Europa Island, Glorioso Islands, Juan de Nova Island (Iles Eparses)
Glorioso Islands
Glorioso Islands (Iles Eparses)
Glorioso Island (Iles Eparses)
Juan de Nova Island
Juan de Nova Island (Iles Eparses)
Tromelin Island
Tromelin Island (Iles Eparses)
Saint Helena
Ascension Island
Ascension
Tristan da Cunha
Tristan da Cunha island group
Baker Island
Baker, Howland, and Jarvis Islands
Baker, Howland, and Jarvis Islands, and Johnston Atoll
Baker, Howland, and Jarvis Islands, and Kingman Reef
Howland Island
Jarvis Island
Johnston Atoll
Johnston Atoll and Kingman Reef
Kingman Reef
Midway Islands
Midway Islands, Johnston, and Palmyra Atolls
Midway Islands and Palmyra Atoll
Palmyra Atoll
note on Palmyra Atoll
EOS

## allow empty lines and skip comments
##  note: entries get stripped - stray leading/trailing whitespace in the
##    list above would otherwise never match in the include? lookup
SKIP_CATEGORIES = SKIP_CATEGORIES_LINES
                    .split( "\n" )
                    .map( &:strip )
                    .reject { |line| line.empty? || line.start_with?( '#' ) }
                    .freeze
99
-
100
-
101
## Prints the category tree as a markdown outline to stdout: one
##  "## name _(count)_" heading per top-level entry, followed by the
##  nested bullet list of its children.
##
## @param data [Hash] top-level name => subtree (read via subtree[:count])
def print_categories( data )
  data.each do |name, subtree|
    puts "", "## #{name} _(#{subtree[:count]})_", ""

    walk_categories( subtree, 1 )
  end
end
111
-
112
## Prints the entries of +data+ as a nested markdown bullet list to stdout
##  and recurses into every printed entry.
##
##  - first-level names are marked bold
##  - every line ends with the entry's count and (if present) codes,
##      e.g. - **Area** _(2 - au be)_
##
## @param data [Hash] name => subtree; the keys :count and :codes hold
##   the stats of the entry itself and are not printed as children
## @param level [Integer] nesting depth, starting at 1
def walk_categories( data, level )
  data.each do |k,v|
    next if k == :count || k == :codes   ## skip "virtual" count entry (added for stats)

    ## skip (sub)country entries e.g. Baker Island, Ile Amsterdam, etc.
    next if SKIP_CATEGORIES.include?( k )

    ## add 4 spaces indent per extra level - a single space is not
    ##   enough for markdown to nest the list item
    indent = '    ' * (level - 1)
    label  = level == 1 ? "**#{k}**" : k.to_s    ## mark first level as bold
    stats  = v[:count].to_s
    stats += " - #{v[:codes]}"  if v[:codes]     ## add codes if present

    puts "#{indent}- #{label} _(#{stats})_"

    walk_categories( v, level+1 )
  end
end
139
-
140
-
141
-
142
## print the collected category tree as a markdown outline
print_categories(c.data)

puts 'Done.'
145
-