factbook-readers 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Manifest.txt +56 -0
  4. data/README.md +196 -0
  5. data/Rakefile +34 -0
  6. data/data/attributes.yml +337 -0
  7. data/data/categories.csv +164 -0
  8. data/data/codes.csv +262 -0
  9. data/data/codesxref.csv +280 -0
  10. data/data/comparisons.csv +75 -0
  11. data/lib/factbook-readers.rb +59 -0
  12. data/lib/factbook-readers/attributes.rb +74 -0
  13. data/lib/factbook-readers/builder.rb +212 -0
  14. data/lib/factbook-readers/builder_item.rb +185 -0
  15. data/lib/factbook-readers/builder_json.rb +79 -0
  16. data/lib/factbook-readers/codes.rb +122 -0
  17. data/lib/factbook-readers/comparisons.rb +50 -0
  18. data/lib/factbook-readers/counter.rb +48 -0
  19. data/lib/factbook-readers/normalize.rb +43 -0
  20. data/lib/factbook-readers/page.rb +148 -0
  21. data/lib/factbook-readers/page_info.rb +12 -0
  22. data/lib/factbook-readers/reader_json.rb +51 -0
  23. data/lib/factbook-readers/sanitizer.rb +307 -0
  24. data/lib/factbook-readers/sect.rb +29 -0
  25. data/lib/factbook-readers/subsect.rb +18 -0
  26. data/lib/factbook-readers/table.rb +52 -0
  27. data/lib/factbook-readers/utils.rb +47 -0
  28. data/lib/factbook-readers/utils_info.rb +129 -0
  29. data/lib/factbook-readers/version.rb +24 -0
  30. data/lib/factbook/readers.rb +5 -0
  31. data/test/data/au.html +579 -0
  32. data/test/data/au.yml +8 -0
  33. data/test/data/be.html +596 -0
  34. data/test/data/be.yml +8 -0
  35. data/test/data/json/au.json +892 -0
  36. data/test/data/src/ag.html +716 -0
  37. data/test/data/src/au-2015-09-24.html +2006 -0
  38. data/test/data/src/au.html +658 -0
  39. data/test/data/src/be-2015-09-24.html +2011 -0
  40. data/test/data/src/be.html +648 -0
  41. data/test/helper.rb +11 -0
  42. data/test/test_attribs.rb +87 -0
  43. data/test/test_attribs_def.rb +20 -0
  44. data/test/test_builder.rb +35 -0
  45. data/test/test_codes.rb +76 -0
  46. data/test/test_comparisons.rb +19 -0
  47. data/test/test_convert.rb +30 -0
  48. data/test/test_counter.rb +31 -0
  49. data/test/test_fields.rb +52 -0
  50. data/test/test_importer.rb +56 -0
  51. data/test/test_item_builder.rb +99 -0
  52. data/test/test_json.rb +45 -0
  53. data/test/test_json_builder.rb +25 -0
  54. data/test/test_normalize.rb +23 -0
  55. data/test/test_page.rb +38 -0
  56. data/test/test_sanitizer.rb +39 -0
  57. data/test/test_sanitizer_regex.rb +89 -0
  58. metadata +196 -0
@@ -0,0 +1,79 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ ######
6
+ # json builder -- lets us rebuild a page from "dumped" json (instead of parsing html page)
7
+
8
class JsonBuilder
  ## Rebuilds the sections / subsections of a page from "dumped" json text
  ##   (instead of parsing the html page).
  ##
  ## The parsed json is walked two levels deep:
  ##   top-level key    => section title
  ##   second-level key => subsection title
  ##   second-level val => subsection data (hash keys get run through
  ##                       normalize_category again)

  include LogUtils::Logging
  include NormalizeHelper    ## e.g. normalize_category


  ## Read the json text stored at path and build from it.
  ##   note: the file is always read as utf-8 (independent of the platform
  ##   default encoding) and a leading byte order mark (BOM) gets dropped -
  ##   JSON.parse would otherwise choke on it.
  def self.from_file( path )
    text = File.open( path, 'r:bom|utf-8' ) { |file| file.read }
    from_string( text )
  end

  ## Build from json text (a string - NOT yet parsed).
  def self.from_string( text )
    new( text )
  end


  attr_reader :text,     ## json text as passed in
              :json,     ## parsed json
              :info,     ## not used yet -- always nil for now
              :errors,   ## not used yet -- always empty for now
              :sects     ## array of Sect objects (one per top-level json key)


  ## text - json text; raises whatever JSON.parse raises on invalid input
  def initialize( text )
    @text   = text
    @json   = JSON.parse( text )

    @info   = nil   ## fix/todo: no page info for now (use a header in json - why? why not?)
    @errors = []    ## fix/todo: no errors tracked for now

    @sects  = @json.map { |title, subsects| build_sect( title, subsects ) }
  end


  private

  ## Build one Sect (with all its subsections) from a top-level json entry.
  def build_sect( title, subsects )
    sect          = Sect.new
    sect.title    = title
    sect.subsects = subsects.map { |sub_title, data| build_subsect( sub_title, data ) }
    sect
  end

  ## Build one Subsect; hash data gets its keys normalized, anything else is kept as is.
  def build_subsect( title, data )
    subsect       = Subsect.new
    subsect.title = title
    subsect.data  = data.is_a?( Hash ) ? normalize_keys( data ) : data
    subsect
  end

  ## Return a new hash with every key run through normalize_category (values untouched).
  def normalize_keys( data )
    data.each_with_object( {} ) do |(key, value), normalized|
      normalized[ normalize_category( key ) ] = value
    end
  end

end # class JsonBuilder
77
+
78
+
79
+ end # module Factbook
@@ -0,0 +1,122 @@
1
+ # encoding: utf-8
2
+
3
+ ##
4
+ # note:
5
+ # the factbook category/region for world is other entities (on FAQ) and oceans in page
6
+ # changed to world
7
+
8
+
9
+ module Factbook
10
+
11
class Codes
  ## List of factbook codes (code, name, category, region) with
  ##   chainable filters e.g. codes.countries.europe.to_a

  Code = Struct.new( :code,      ## todo: add notes (country affiliation) - why? why not??
                     :name,
                     :category,  ## e.g. Countries, Other, Oceans, World, Dependencies, etc.
                     :region )   ## e.g. Europe, Oceans, etc.

  ## Build a Codes list from the rows returned by CsvHash.read( path ).
  ##
  ## Expected headers: Code, Name and (optional) Category, Region.
  ##
  ## note:
  ##   if you use quotes - NO leading spaces allowed e.g.
  ##     use au,"Austria",...  and NOT  au, "Austria", ...
  ##   for headers - NO leading spaces allowed e.g.
  ##     use Code,Name,Category,Region  and NOT  Code, Name, Category, Region
  ##
  ## Code and Name are required (a missing value raises NoMethodError on nil);
  ## a missing, empty or whitespace-only Category / Region is stored as nil.
  def self.from_csv( path )
    rows = CsvHash.read( path )

    recs = rows.map do |row|
      rec          = Code.new
      rec.code     = row['Code'].strip    ## remove leading and trailing whitespace
      rec.name     = row['Name'].strip
      rec.category = blank_to_nil( row['Category'] )
      rec.region   = blank_to_nil( row['Region'] )
      rec
    end

    new( recs )
  end

  ## Strip value; return nil for nil, empty or whitespace-only strings.
  def self.blank_to_nil( value )
    return nil if value.nil?

    stripped = value.strip
    stripped.empty? ? nil : stripped
  end
  private_class_method :blank_to_nil


  ## codes - array of Code records
  def initialize( codes )
    @codes = codes
  end

  def size() @codes.size; end

  def each( &blk ) @codes.each( &blk ); end

  def select( &blk )
    Codes.new( @codes.select( &blk ) )   ## return (again) new Codes obj for easy-chaining
  end

  ## Return array of code strings (NOT of Code records).
  def to_a
    @codes.map { |code| code.code }
  end

  ## "pre-defined" convenience shortcuts
  def countries()       category 'Countries'; end
  def world()           category 'World'; end
  def oceans()          category 'Oceans'; end
  def misc()            category 'Miscellaneous'; end
  def others()          category 'Other'; end
  def dependencies()    category 'Dependencies'; end
  def dependencies_us() category 'Dependencies (United States)'; end
  ## fix/todo: add all dependencies uk (or gb?), fr,cn,au,nz,no,dk,etc.

  def europe()                      region 'Europe'; end
  def south_asia()                  region 'South Asia'; end
  def central_asia()                region 'Central Asia'; end
  def east_n_souteast_asia()        region 'East & Southeast Asia'; end
  def middle_east()                 region 'Middle East'; end
  def africa()                      region 'Africa'; end
  def north_america()               region 'North America'; end
  def central_america_n_caribbean() region 'Central America and Caribbean'; end
  def south_america()               region 'South America'; end
  def australia_oceania()           region 'Australia-Oceania'; end
  def antartica()                   region 'Antarctica'; end

  ## correctly spelled names; the (misspelled) originals are kept for backward compatibility
  alias_method :east_n_southeast_asia, :east_n_souteast_asia
  alias_method :antarctica,            :antartica

  ## note: regions oceans and world - same as category oceans and world
  ##   use category('World')  and region('World')
  ##   use category('Oceans') and region('Oceans')


  ## Return new Codes obj with all records whose category contains query
  ##   (case-insensitive substring match; records without category never match).
  def category( query )
    filter_by( :category, query )
  end

  ## Return new Codes obj with all records whose region contains query
  ##   (case-insensitive substring match; records without region never match).
  def region( query )
    filter_by( :region, query )
  end


  private

  ## Shared filter for category / region.
  ##   todo/future: allow passing in of regex too (not just string)
  ##   note: query gets escaped e.g. Dependencies (France) becomes
  ##           Dependencies\ \(France\)
  def filter_by( attribute, query )
    filter_regex = /#{Regexp.escape(query)}/i

    matches = @codes.select do |code|
      value = code[ attribute ]
      value && filter_regex =~ value   ## note: nil value never matches
    end

    Codes.new( matches )   ## return new Codes obj for easy-chaining
  end

end # class Codes
120
+
121
+ end # module Factbook
122
+
@@ -0,0 +1,50 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
class Comparisons
  ## List of factbook comparison records (num, category, name).

  Comparison = Struct.new( :num,       ### todo: use no or id or something - why? why not?
                           :category,  ## e.g. Geography, People, Economy, etc.
                           :name )

  ## Build a Comparisons list from the rows returned by CsvHash.read( path ).
  ##
  ## Expected headers: Num, Category, Name - all three are required
  ##   (a missing value raises NoMethodError on nil).
  ## Values get stripped; Num gets converted with to_i
  ##   (note: to_i returns 0 for non-numeric text).
  def self.from_csv( path )
    rows = CsvHash.read( path )

    recs = rows.map do |row|
      Comparison.new( row['Num'].strip.to_i,
                      row['Category'].strip,
                      row['Name'].strip )
    end

    new( recs )
  end

  ## comps - array of Comparison records
  def initialize( comps )
    @comps = comps
  end

  def size() @comps.size; end

  ## Yield every Comparison record; returns an enumerator if no block given.
  def each( &blk ) @comps.each( &blk ); end

  ## Return array of nums -- return something else - why? why not?
  def to_a
    @comps.map { |comp| comp.num }
  end

end # class Comparisons
48
+
49
+ end # module Factbook
50
+
@@ -0,0 +1,48 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
class Counter
  ## Counts how often every (hash-valued) node shows up across pages.
  ##
  ## data is a nested hash mirroring the page data hashes; every node holds
  ##   :count - number of counted pages having this node
  ##   :codes - space-separated country codes of those pages; gets dropped
  ##            once count passes the threshold (max_codes)
  ## next to the (string) keys of its child nodes.

  attr_reader :data

  ## max_codes - threshold; the :codes entry of a node gets deleted once
  ##             its count is larger than this (default: 9)
  def initialize( max_codes: 9 )
    @data      = {}
    @max_codes = max_codes
  end

  ## Walk the page data hash and add / update the nodes in data.
  ##   page must respond to data (nested hash) and info.country_code.
  def count( page )
    walk( page, page.data, @data )
  end


  private

  ## Update hout for every hash-valued entry of hin (recursively);
  ##   all non-hash values (leafs) are skipped.
  def walk( page, hin, hout )
    hin.each do |key, value|
      next unless value.is_a?( Hash )

      ## note: use ''.dup - the string gets appended to in place (with <<);
      ##         a plain literal might be frozen
      stats = hout[key] ||= { count: 0, codes: ''.dup }

      stats[ :count ] += 1

      ## delete codes if larger than threshold (e.g. 9)
      stats.delete( :codes ) if stats[ :count ] > @max_codes

      codes = stats[ :codes ]
      if codes    ## note: might have been deleted if threshold passed
        codes << ' '  unless codes.empty?   ## add separator (space for now)
        codes << page.info.country_code
      end

      walk( page, value, stats )
    end
  end

end # class Counter
47
+
48
+ end # module Factbook
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
module NormalizeHelper

  ## special cases -- exact (case-sensitive) match after stripping => replacement
  CATEGORY_FIXES = {
    ## typos e.g. ntoe => use note
    'ntoe'                           => 'note',
    'investment if fixed capital'    => 'investment in fixed capital',

    ## downcase
    'Lowest point'                   => 'lowest point',
    'Chief of state'                 => 'chief of state',

    ## spelling variant (use more popular one)
    'signed but not ratified'        => 'signed, but not ratified',
    'vectorborne disease'            => 'vectorborne diseases',
    'water contact diseases'         => 'water contact disease',
    'food or waterborne disease'     => 'food or waterborne diseases',
    'geographical coordinates'       => 'geographic coordinates',
    'notes'                          => 'note',
    'refugees (countries of origin)' => 'refugees (country of origin)',
  }.freeze


  ## Normalize a category (field) name:
  ##   - strip leading / trailing whitespace
  ##   - remove all trailing colons e.g. note: or note:: (typo) => note
  ##   - fix known typos / case / spelling variants (see CATEGORY_FIXES)
  ##   - cut off everything after border countries e.g.
  ##       border countries (8) => border countries
  ##
  ## text - category name (string); returns the normalized name (string)
  def normalize_category( text )
    text = text.strip
    text = text.sub( /:+\z/, '' )   ## note: allow (fix) note:: too, thus, use :+
    text = text.strip

    fixed = CATEGORY_FIXES[ text ]
    text  = fixed.dup   if fixed    ## note: dup - do NOT hand out the shared table entry

    ## border countries (8): -- remove (x) counter
    text = 'border countries'  if text.start_with?( 'border countries' )

    text
  end

end # module NormalizeHelper
43
+ end # module Factbook
@@ -0,0 +1,148 @@
1
+
2
+ module Factbook
3
+
4
+
5
+ ## note:
6
+ ## some factbook pages with chrome (headers, footers, etc.)
7
+ ## are NOT valid utf-8, thus,
8
+ ## treat page as is (e.g. ASCII8BIT)
9
+ #
10
+ # only convert to utf8 when header and footer got stripped
11
+
12
+ ##
13
+ ## be/benin:
14
+ ## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
15
+ #
16
+ ## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
17
+ # Lazare Sèhouéto
18
+ #
19
+ # looks good - use (assume) Windows-1252 ????
20
+
21
+ ##
22
+ # check for is ascii 7-bit ??? if yes -noworries
23
+ # if not, log number of chars not using ascii 7-bit
24
+
25
+
26
+
27
class Page
  ## One factbook (country) page - built from json text, from passed-in
  ##   html or from the html read via Webcache for the country code.

  include LogUtils::Logging

  attr_reader :sects    ## "structured" access e.g. sects/subsects/etc.
  attr_reader :info     ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
  attr_reader :data     ## "plain" access with vanilla hash


  ## standard version  (note: requires https)
  SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'

  ## code - country code; used to build the page url (only if neither
  ##        opts[:json] nor opts[:html] is passed in)
  ## opts - :json  json text (string - NOT yet parsed); wins over :html
  ##        :html  html page text (for debugging and testing);
  ##               note: expects ASCII-7BIT/BINARY encoding
  ##        :info  page info - lets you overwrite the info from the builder
  def initialize( code, opts={} )
    ### keep code - why? why not??  (use page_info/info e.g. info.country_code??)

    builder = if opts[:json]
                JsonBuilder.from_string( opts[:json] )
              else   ## assume html
                ## html = fetch_page( url )  ## use PageFetcher class - why?? why not??
                html = opts[:html] || Webcache.read( SITE_BASE.gsub( '{code}', code ) )
                Builder.from_string( html )
              end

    @sects = builder.sects

    ## todo/fix/quick hack:
    ##   info opts hash entry lets you overwrite page info
    ##   -- use proper header to setup page info - why, why not??
    @info  = opts[:info] || builder.info

    @data = {}
    @sects.each do |sect|
      @data[ sect.title ] = sect.data
    end
  end


  ## Convenience helper for data.to_json;  note: pretty print by default!
  ##   opts - :minify  if truthy use data.to_json (no pretty print)
  def to_json( opts={} )
    if opts[:minify]
      data.to_json
    else
      ## was: -- opts[:pretty] || opts[:pp]
      JSON.pretty_generate( data )
    end
  end


  ## Convenience shortcut - lets you use
  ##     page['geo']
  ##   instead of
  ##     page.data['geo']
  def [](key)
    ## fix: use delegate data, [] from forwardable lib - why?? why not??
    data[key]
  end


  ## add convenience (shortcut) accessors / attributes / fields / getters
  ##  e.g.
  ##    def background()  data['Introduction']['Background']['text']; end
  ##    def location()    data['Geography']['Location']['text']; end
  ##  etc.
  ##
  ##  note: walks the complete attrib.path (any number of keys);
  ##          missing category or path keys result in nil
  ATTRIBUTES.each do |attrib|
    define_method attrib.name.to_sym do
      node = attrib.path.reduce( @data.fetch( attrib.category, {} ) ) do |hash, key|
        hash.fetch( key, {} )
      end
      node['text']
    end
  end


  private

  ## note: not called for now (see initialize - Webcache.read is used instead)
  def fetch_page( url )
    response = Webget.page( url )

    ## note: exit on get / fetch error - do NOT continue for now - why? why not?
    ## NOTE(review): exit terminates the whole (host) process - raising an
    ##                 error would be friendlier for library use
    exit 1  if response.status.nok?    ## e.g.  HTTP status code != 200

    response.text
  end


  ## old (disabled) constructors - kept for reference:
  ##
  ##   def self.from_url( cc, cn )
  ##     html_ascii = PageFetcher.new.fetch( cc )
  ##     self.new( cc, cn, html_ascii )
  ##   end
  ##
  ##   def self.from_file( cc, cn, opts={} )
  ##     input_dir = opts[:input_dir] || '.'
  ##     html_ascii = File.read( "#{input_dir}/#{cc}.html" )    ## fix/todo: use ASCII8BIT/binary reader
  ##     self.new( cc, cn, html_ascii )
  ##   end

end # class Page
148
+ end # module Factbook