factbook-readers 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Manifest.txt +56 -0
  4. data/README.md +196 -0
  5. data/Rakefile +34 -0
  6. data/data/attributes.yml +337 -0
  7. data/data/categories.csv +164 -0
  8. data/data/codes.csv +262 -0
  9. data/data/codesxref.csv +280 -0
  10. data/data/comparisons.csv +75 -0
  11. data/lib/factbook-readers.rb +59 -0
  12. data/lib/factbook-readers/attributes.rb +74 -0
  13. data/lib/factbook-readers/builder.rb +212 -0
  14. data/lib/factbook-readers/builder_item.rb +185 -0
  15. data/lib/factbook-readers/builder_json.rb +79 -0
  16. data/lib/factbook-readers/codes.rb +122 -0
  17. data/lib/factbook-readers/comparisons.rb +50 -0
  18. data/lib/factbook-readers/counter.rb +48 -0
  19. data/lib/factbook-readers/normalize.rb +43 -0
  20. data/lib/factbook-readers/page.rb +148 -0
  21. data/lib/factbook-readers/page_info.rb +12 -0
  22. data/lib/factbook-readers/reader_json.rb +51 -0
  23. data/lib/factbook-readers/sanitizer.rb +307 -0
  24. data/lib/factbook-readers/sect.rb +29 -0
  25. data/lib/factbook-readers/subsect.rb +18 -0
  26. data/lib/factbook-readers/table.rb +52 -0
  27. data/lib/factbook-readers/utils.rb +47 -0
  28. data/lib/factbook-readers/utils_info.rb +129 -0
  29. data/lib/factbook-readers/version.rb +24 -0
  30. data/lib/factbook/readers.rb +5 -0
  31. data/test/data/au.html +579 -0
  32. data/test/data/au.yml +8 -0
  33. data/test/data/be.html +596 -0
  34. data/test/data/be.yml +8 -0
  35. data/test/data/json/au.json +892 -0
  36. data/test/data/src/ag.html +716 -0
  37. data/test/data/src/au-2015-09-24.html +2006 -0
  38. data/test/data/src/au.html +658 -0
  39. data/test/data/src/be-2015-09-24.html +2011 -0
  40. data/test/data/src/be.html +648 -0
  41. data/test/helper.rb +11 -0
  42. data/test/test_attribs.rb +87 -0
  43. data/test/test_attribs_def.rb +20 -0
  44. data/test/test_builder.rb +35 -0
  45. data/test/test_codes.rb +76 -0
  46. data/test/test_comparisons.rb +19 -0
  47. data/test/test_convert.rb +30 -0
  48. data/test/test_counter.rb +31 -0
  49. data/test/test_fields.rb +52 -0
  50. data/test/test_importer.rb +56 -0
  51. data/test/test_item_builder.rb +99 -0
  52. data/test/test_json.rb +45 -0
  53. data/test/test_json_builder.rb +25 -0
  54. data/test/test_normalize.rb +23 -0
  55. data/test/test_page.rb +38 -0
  56. data/test/test_sanitizer.rb +39 -0
  57. data/test/test_sanitizer_regex.rb +89 -0
  58. metadata +196 -0
@@ -0,0 +1,79 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ ######
6
+ # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
7
+
8
# Rebuilds the section / subsection structure of a page from previously
# "dumped" json text (instead of parsing the html page).
#
# The json is expected to be a hash of section title => subsections, where
# subsections is a hash of subsection title => data.
class JsonBuilder
  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category

  # Reads the file at +path+ and builds from its content.
  #
  # The file is read explicitly as utf-8 (skipping a leading byte order mark
  # if present) so the result does not depend on the default external
  # encoding of the running process.
  def self.from_file( path )
    text = File.open( path, 'r:bom|utf-8' ) { |file| file.read }
    from_string( text )
  end

  # Builds from json passed in as a string (NOT yet parsed).
  def self.from_string( text )
    new( text )
  end

  attr_reader :text,     ## json source text as passed in
              :json,     ## parsed json
              :info,     ## not used yet -- page info incl. country_name, region_name, last_updated etc.
              :errors,   ## not used yet -- encoding errors etc.
              :sects     ## array of Sect objects (one per top-level json entry)

  # Parses +text+ as json and builds one Sect per top-level entry.
  # Raises whatever JSON.parse raises for invalid json.
  def initialize( text )
    @text   = text
    @json   = JSON.parse( text )

    @info   = nil    ## fix/todo: for now no page info (use header in json - why? why not??)
    @errors = []     ## fix/todo: for now no errors possible/tracked

    @sects  = @json.map { |title, subsects| build_sect( title, subsects ) }
  end

  private

  # Builds a Sect titled +title+ holding one Subsect per entry in +subsects_data+.
  def build_sect( title, subsects_data )
    sect = Sect.new
    sect.title    = title
    sect.subsects = subsects_data.map { |subtitle, data| build_subsect( subtitle, data ) }
    sect
  end

  # Builds a Subsect titled +title+; hash data gets its keys normalized.
  def build_subsect( title, data )
    subsect = Subsect.new
    subsect.title = title
    subsect.data  = data.is_a?( Hash ) ? normalize_keys( data ) : data
    subsect
  end

  # Returns a copy of +data+ with every key run through normalize_category (again).
  def normalize_keys( data )
    normalized = {}
    data.each { |key, value| normalized[ normalize_category( key ) ] = value }
    normalized
  end
end # class JsonBuilder
77
+
78
+
79
+ end # module Factbook
@@ -0,0 +1,122 @@
1
+ # encoding: utf-8
2
+
3
+ ##
4
+ # note:
5
+ # the factbook lists the category/region for world as other entities (on the FAQ) and as oceans (in the page);
6
+ # changed to world here
7
+
8
+
9
+ module Factbook
10
+
11
# Collection of factbook codes with their name, category and region.
#
# Filter methods (select, category, region and the shortcuts built on them)
# return a new Codes object, so they can be chained,
# e.g. codes.europe.countries.to_a
class Codes
  include Enumerable    ## adds map, first, count, etc. on top of each

  Code = Struct.new( :code,       ## todo: add notes (country affiliation) - why? why not??
                     :name,
                     :category,   ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
                     :region      ## e.g. Europe, Oceans, etc.
                   )

  # Reads the csv file at +path+ (via CsvHash.read) and returns a new Codes object.
  #
  # note:
  #   if you use quotes - NO leading spaces allowed e.g.
  #     use au,"Austria",...  and NOT
  #         au, "Austria", ...
  #
  #   for headers - NO leading spaces allowed e.g.
  #     use Code,Name,Category,Region,...  and NOT
  #         Code, Name, Category, Region, ...
  #
  # The Code and Name columns are required; Category and Region are optional
  # (missing, empty or whitespace-only cells become nil).
  def self.from_csv( path )
    rows = CsvHash.read( path )

    recs = rows.map do |row|
      rec = Code.new
      rec.code     = row['Code'].strip   ## remove leading and trailing whitespace
      rec.name     = row['Name'].strip
      rec.category = optional_value( row['Category'] )
      rec.region   = optional_value( row['Region'] )
      rec
    end

    new( recs )
  end

  # Returns +value+ stripped, or nil if it is nil or blank.
  def self.optional_value( value )
    return nil if value.nil?

    stripped = value.strip
    stripped.empty? ? nil : stripped
  end
  private_class_method :optional_value

  # codes - array of Code records
  def initialize( codes )
    @codes = codes
  end

  def size() @codes.size; end

  def each( &blk ) @codes.each( &blk ); end

  # Returns the records the block accepts wrapped in a new Codes object (for easy chaining).
  def select( &blk )
    Codes.new( @codes.select( &blk ) )
  end

  # Returns an array of the code strings (NOT the records).
  def to_a
    @codes.map { |rec| rec.code }
  end

  ## def all() self.to_a; end    ## note: alias for to_a - use - why? why not??

  ## "pre-defined" convenience shortcuts
  def countries()        category 'Countries'; end
  def world()            category 'World'; end
  def oceans()           category 'Oceans'; end
  def misc()             category 'Miscellaneous'; end
  def others()           category 'Other'; end
  def dependencies()     category 'Dependencies'; end
  def dependencies_us()  category 'Dependencies (United States)'; end
  ## fix/todo: add all dependencies uk (or gb?), fr,cn,au,nz,no,dk,etc.

  def europe()                       region 'Europe'; end
  def south_asia()                   region 'South Asia'; end
  def central_asia()                 region 'Central Asia'; end
  def east_n_souteast_asia()         region 'East & Southeast Asia'; end
  def middle_east()                  region 'Middle East'; end
  def africa()                       region 'Africa'; end
  def north_america()                region 'North America'; end
  def central_america_n_caribbean()  region 'Central America and Caribbean'; end
  def south_america()                region 'South America'; end
  def australia_oceania()            region 'Australia-Oceania'; end
  def antartica()                    region 'Antarctica'; end

  ## correctly spelled names; the misspelled originals are kept for backward compatibility
  alias_method :east_n_southeast_asia, :east_n_souteast_asia
  alias_method :antarctica,            :antartica

  ## note: regions oceans and world - same as category oceans and world
  ##   use oceans_ii or world_ii or something ??
  ##   use category('World') n region('World')
  ##   use category('Oceans') n region('Oceans')

  # Returns a new Codes object with all records whose category contains
  # +query+ (case-insensitive substring match; records without a category never match).
  def category( query )
    filter_by( :category, query )
  end

  # Returns a new Codes object with all records whose region contains
  # +query+ (case-insensitive substring match; records without a region never match).
  def region( query )
    filter_by( :region, query )
  end

  private

  # Shared implementation for category and region.
  ## todo/future: allow passing in of regex too (not just string)
  def filter_by( field, query )
    ## note: escape query e.g. Dependencies (France) becomes Dependencies \(France\)
    filter_regex = /#{Regexp.escape(query)}/i
    matches = @codes.select do |rec|
      value = rec[field]
      value ? filter_regex.match( value ) : false   ## note: allow nil; never matches
    end
    Codes.new( matches )   ## return new Codes obj for easy-chaining
  end
end # class Codes
120
+
121
+ end # module Factbook
122
+
@@ -0,0 +1,50 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
# Collection of factbook country comparisons (num, category and name).
class Comparisons
  include Enumerable    ## adds map, first, count, etc. on top of each

  Comparison = Struct.new( :num,        ### todo: use no or id or something - why? why not?
                           :category,   ## e.g. Geography, People, Economy, etc.
                           :name
                         )

  # Reads the csv file at +path+ (via CsvHash.read) and returns a new
  # Comparisons object. The Num, Category and Name columns are all required.
  def self.from_csv( path )
    rows = CsvHash.read( path )

    recs = rows.map do |row|
      rec = Comparison.new
      rec.num      = row['Num'].strip.to_i    ## remove leading and trailing whitespace
      rec.category = row['Category'].strip
      rec.name     = row['Name'].strip
      rec
    end

    new( recs )
  end

  # comps - array of Comparison records
  def initialize( comps )
    @comps = comps
  end

  def size() @comps.size; end

  # Yields every Comparison record; returns an enumerator if no block is given.
  def each( &blk ) @comps.each( &blk ); end

  # Returns an array of the nums (NOT the records).
  def to_a
    @comps.map { |comp| comp.num }   ## return array of nums -- return something else - why? why not?
  end
end # class Comparisons
48
+
49
+ end # module Factbook
50
+
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
# Counts how many pages use each (nested) hash key of their data.
#
# The result in data mirrors the nesting of the page data: every key whose
# value is a hash gets an entry holding
#   :count - the number of pages counted that have this key
#   :codes - a space-separated list of the country codes of those pages
#            (dropped once more than CODES_MAX pages have the key)
# plus the entries for its own nested keys.
class Counter

  ## keep the list of country codes only for keys used by up to this many pages
  CODES_MAX = 9

  attr_reader :data

  def initialize
    @data = {}
  end

  # Adds the keys of page.data to the running totals.
  #
  # page must respond to data (nested hash) and info.country_code (string).
  def count( page )
    walk( page, page.data, @data )
  end


  private

  # Walks the hash hin and records every key with a hash value in hout,
  # recursing into the nested hashes. Non-hash values (leaves) are skipped.
  def walk( page, hin, hout )
    hin.each do |key, value|
      next unless value.is_a?( Hash )

      ## note: use a mutable copy (NOT the string literal itself) because codes get appended in place
      stats = hout[key] ||= { count: 0, codes: ''.dup }
      stats[ :count ] += 1

      ## delete codes once past the threshold
      stats.delete( :codes ) if stats[ :count ] > CODES_MAX

      codes = stats[ :codes ]
      if codes   ## note: nil once deleted
        codes << ' ' unless codes.empty?   ## add separator (space for now)
        codes << page.info.country_code
      end

      walk( page, value, stats )
    end
  end
end # class Counter
47
+
48
+ end # module Factbook
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
# Mixin with helpers for cleaning up category (field) names.
module NormalizeHelper

  ## special cases: exact (case-sensitive) category name => name to use instead
  CATEGORY_FIXES = {
    ## typos e.g ntoe => use note
    'ntoe'                           => 'note',
    'investment if fixed capital'    => 'investment in fixed capital',

    ## downcase
    'Lowest point'                   => 'lowest point',
    'Chief of state'                 => 'chief of state',

    ## spelling variant (use more popular one)
    'signed but not ratified'        => 'signed, but not ratified',
    'vectorborne disease'            => 'vectorborne diseases',
    'water contact diseases'         => 'water contact disease',
    'food or waterborne disease'     => 'food or waterborne diseases',
    'geographical coordinates'       => 'geographic coordinates',
    'notes'                          => 'note',
    'refugees (countries of origin)' => 'refugees (country of origin)'
  }.freeze

  # Returns a cleaned-up copy of the category name +text+:
  #
  # - leading and trailing whitespace gets removed
  # - trailing colons get removed; note: fixes typos/errors with
  #   double colons too e.g. note:: (instead of note:)
  # - known typos, case and spelling variants get replaced (see CATEGORY_FIXES)
  # - border countries (8) and the like -- the (x) counter gets removed
  #
  # +text+ itself is never changed; the result is always a new string.
  def normalize_category( text )
    text = text.strip.sub( /:+\z/, '' ).strip

    fixed = CATEGORY_FIXES[ text ]
    text  = fixed.dup   if fixed   ## note: dup keeps the table safe if a caller changes the result in place

    text = 'border countries' if text.start_with?( 'border countries' )

    text
  end

end # module NormalizeHelper
43
+ end # module Factbook
@@ -0,0 +1,148 @@
1
+
2
+ module Factbook
3
+
4
+
5
+ ## note:
6
+ ## some factbook pages with chrome (headers, footers, etc.)
7
+ ## are NOT valid utf-8, thus,
8
+ ## treat page as is (e.g. ASCII8BIT)
9
+ #
10
+ # only convert to utf8 when header and footer got stripped
11
+
12
+ ##
13
+ ## be/benin:
14
+ ## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
15
+ #
16
+ ## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
17
+ # Lazare Sèhouéto
18
+ #
19
+ # looks good - use (assume) Windows-1252 ????
20
+
21
+ ##
22
+ # check if the page is ascii 7-bit only ??? if yes - no worries
23
+ # if not, log the number of chars that are not ascii 7-bit
24
+
25
+
26
+
27
# A factbook page built from json text, from a passed-in html page,
# or from the html page read for the country code via Webcache.
class Page
  include LogUtils::Logging

  attr_reader :sects    ## "structured" access e.g. sects/subsects/etc.
  attr_reader :info     ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
  attr_reader :data     ## "plain" access with vanilla hash


  ## standard version  (note: requires https)
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'

  # code - country code; only used to build the page url when neither
  #        opts[:json] nor opts[:html] is passed in
  # opts - :json  page as json text (NOT yet parsed); takes precedence over :html
  #        :html  page as html text; note: expects ASCII-7BIT/BINARY encoding
  #        :info  page info to use instead of the info found by the builder
  def initialize( code, opts={} )
    ### keep code - why? why not??  (use page_info/info e.g. info.country_code??)

    if opts[:json]
      ## note: json is (still) a string/text (NOT yet parsed to structured data)
      b = JsonBuilder.from_string( opts[:json] )
    else  ## assume html
      ## for debugging and testing allow "custom" passed-in html page;
      ##  otherwise read the page for the code
      ## note: expects ASCII-7BIT/BINARY encoding
      html = opts[:html] || Webcache.read( SITE_BASE.gsub( '{code}', code ) )
      b = Builder.from_string( html )
    end

    @sects = b.sects
    @info  = b.info

    ## todo/fix/quick hack:
    ##   check for info opts hash entry - lets you overwrite page info
    ##   -- use proper header to setup page info - why, why not??
    @info = opts[:info]   if opts[:info]

    @data = {}
    @sects.each { |sect| @data[ sect.title ] = sect.data }
  end


  # Convenience helper for data.to_json; note: pretty print by default!
  # Pass in minify: true for the compact version.
  def to_json( opts={} )
    if opts[:minify]
      data.to_json
    else
      ## was: -- opts[:pretty] || opts[:pp]
      JSON.pretty_generate( data )
    end
  end


  # Convenience shortcut - lets you use
  #    page['geo']
  #  instead of
  #    page.data['geo']
  def [](key)
    ## fix: use delegate data, [] from forwardable lib - why?? why not??
    data[key]
  end

  ## add convenience (shortcut) accessors / attributes / fields / getters
  ##  e.g.
  ##    def background()  data['Introduction']['Background']['text']; end
  ##    def location()    data['Geography']['Location']['text']; end
  ##    etc.
  ##
  ## note: walks attrib.path step by step, thus, works for paths of any size;
  ##   a missing category or path entry results in nil
  ATTRIBUTES.each do |attrib|
    define_method attrib.name.to_sym do
      node = attrib.path.inject( @data.fetch( attrib.category, {} ) ) do |hash, key|
        hash.fetch( key, {} )
      end
      node['text']
    end
  end


  private

  # Gets the page at url via Webget and returns the response text.
  def fetch_page( url )
    response = Webget.page( url )

    ## note: exit on get / fetch error - do NOT continue for now - why? why not?
    ## NOTE(review): this ends the whole process -- consider raising an error instead
    exit 1  if response.status.nok?    ## e.g.  HTTP status code != 200

    response.text
  end


  ## old (disabled) constructors - kept for reference:
  ##
  ## def self.from_url( cc, cn )
  ##   html_ascii = PageFetcher.new.fetch( cc )
  ##   self.new( cc, cn, html_ascii )
  ## end
  ##
  ## def self.from_file( cc, cn, opts={} )
  ##   input_dir = opts[:input_dir] || '.'
  ##   html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
  ##   self.new( cc, cn, html_ascii )
  ## end

end # class Page
148
+ end # module Factbook