factbook 2.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -1
  3. data/Manifest.txt +0 -61
  4. data/README.md +8 -506
  5. data/Rakefile +4 -9
  6. data/lib/factbook.rb +4 -64
  7. metadata +6 -124
  8. data/data/attributes.yml +0 -337
  9. data/data/categories.csv +0 -164
  10. data/data/codes.csv +0 -262
  11. data/data/codesxref.csv +0 -280
  12. data/data/comparisons.csv +0 -75
  13. data/lib/factbook/almanac.rb +0 -72
  14. data/lib/factbook/attributes.rb +0 -74
  15. data/lib/factbook/builder.rb +0 -212
  16. data/lib/factbook/builder_item.rb +0 -126
  17. data/lib/factbook/builder_json.rb +0 -79
  18. data/lib/factbook/codes.rb +0 -119
  19. data/lib/factbook/comparisons.rb +0 -50
  20. data/lib/factbook/counter.rb +0 -48
  21. data/lib/factbook/db/importer.rb +0 -92
  22. data/lib/factbook/db/models.rb +0 -11
  23. data/lib/factbook/db/schema.rb +0 -36
  24. data/lib/factbook/normalize.rb +0 -43
  25. data/lib/factbook/page.rb +0 -148
  26. data/lib/factbook/page_info.rb +0 -12
  27. data/lib/factbook/reader_json.rb +0 -51
  28. data/lib/factbook/sanitizer.rb +0 -178
  29. data/lib/factbook/sect.rb +0 -29
  30. data/lib/factbook/subsect.rb +0 -18
  31. data/lib/factbook/table.rb +0 -52
  32. data/lib/factbook/utils.rb +0 -85
  33. data/lib/factbook/utils_info.rb +0 -129
  34. data/lib/factbook/version.rb +0 -21
  35. data/script/almanac.rb +0 -48
  36. data/script/attributes.rb +0 -34
  37. data/script/build.rb +0 -28
  38. data/script/counter.rb +0 -145
  39. data/script/json.rb +0 -19
  40. data/script/testbr.rb +0 -33
  41. data/script/testcodes.rb +0 -11
  42. data/test/data/au.html +0 -579
  43. data/test/data/au.yml +0 -8
  44. data/test/data/be.html +0 -596
  45. data/test/data/be.yml +0 -8
  46. data/test/data/json/au.json +0 -892
  47. data/test/data/src/ag.html +0 -716
  48. data/test/data/src/au-2015-09-24.html +0 -2006
  49. data/test/data/src/au.html +0 -658
  50. data/test/data/src/be-2015-09-24.html +0 -2011
  51. data/test/data/src/be.html +0 -648
  52. data/test/helper.rb +0 -11
  53. data/test/test_attribs.rb +0 -87
  54. data/test/test_attribs_def.rb +0 -20
  55. data/test/test_builder.rb +0 -35
  56. data/test/test_codes.rb +0 -76
  57. data/test/test_comparisons.rb +0 -19
  58. data/test/test_convert.rb +0 -30
  59. data/test/test_counter.rb +0 -31
  60. data/test/test_fields.rb +0 -52
  61. data/test/test_importer.rb +0 -56
  62. data/test/test_item_builder.rb +0 -99
  63. data/test/test_json.rb +0 -45
  64. data/test/test_json_builder.rb +0 -25
  65. data/test/test_normalize.rb +0 -23
  66. data/test/test_page.rb +0 -38
  67. data/test/test_sanitizer.rb +0 -39
  68. data/test/test_sanitizer_regex.rb +0 -89
@@ -1,148 +0,0 @@
1
-
2
- module Factbook
3
-
4
-
5
- ## note:
6
- ## some factbook pages with chrome (headers, footers, etc.)
7
- ## are NOT valid utf-8, thus,
8
- ## treat page as is (e.g. ASCII8BIT)
9
- #
10
- # only convert to utf8 when header and footer got stripped
11
-
12
- ##
13
- ## be/benin:
14
- ## Key Force or FC [Lazare S?xx?HOU?xx?TO] -- two invalid byte code chars in Political parties and leaders:
15
- #
16
- ## in Western/Windows-1252 leads to FC [Lazare SÈHOUÉTO];
17
- # Lazare Sèhouéto
18
- #
19
- # looks good - use (assume) Windows-1252 ????
20
-
21
- ##
22
- # check if page is ascii 7-bit ??? if yes - no worries
23
- # if not, log number of chars not using ascii 7-bit
24
-
25
-
26
-
27
- class Page
28
- include LogUtils::Logging
29
-
30
- attr_reader :sects ## "structured" access e.g. sects/subsects/etc.
31
- attr_reader :info ## meta info e.g. country_code, country_name, region_name, last_updated, etc.
32
- attr_reader :data ## "plain" access with vanilla hash
33
-
34
-
35
- ## standard version (note: requires https)
36
- SITE_BASE = 'https://www.cia.gov/library/publications/the-world-factbook/geos/{code}.html'
37
-
38
- def initialize( code, opts={} )
39
- ### keep code - why? why not?? (use page_info/info e.g. info.country_code??)
40
-
41
- if opts[:json]
42
- json = opts[:json] ## note: json is (still) a string/text (NOT yet parsed to structured data)
43
- b = JsonBuilder.from_string( json )
44
- else ## assume html
45
- if opts[:html] ## note: expects ASCII-7BIT/BINARY encoding
46
- ## for debugging and testing allow "custom" passed-in html page
47
- html = opts[:html]
48
- else
49
- url_string = SITE_BASE.gsub( '{code}', code )
50
- ## note: expects ASCII-7BIT/BINARY encoding
51
-
52
- ## html = fetch_page( url_string ) ## use PageFetcher class - why?? why not??
53
- html = Webcache.read( url_string )
54
- end
55
- b = Builder.from_string( html )
56
- end
57
-
58
- @sects = b.sects
59
- @info = b.info
60
-
61
- ## todo/fix/quick hack:
62
- ## check for info opts hash entry - lets you overwrite page info
63
- ## -- use proper header to setup page info - why, why not??
64
- if opts[:info]
65
- info = opts[:info]
66
- @info = info
67
- end
68
-
69
- @data = {}
70
- @sects.each do |sect|
71
- @data[ sect.title ] = sect.data
72
- end
73
-
74
- self ## return self (check - not needed??)
75
- end
76
-
77
-
78
- def to_json( opts={} ) ## convenience helper for data.to_json; note: pretty print by default!
79
- if opts[:minify]
80
- data.to_json
81
- else
82
- ## was: -- opts[:pretty] || opts[:pp]
83
- JSON.pretty_generate( data ) ## note: pretty print by default!
84
- end
85
- end
86
-
87
-
88
- def [](key) ### convenience shortcut
89
- # lets you use
90
- # page['geo']
91
- # instead of
92
- # page.data['geo']
93
-
94
- ## fix: use delegate data, [] from forwardable lib - why?? why not??
95
-
96
- data[key]
97
- end
98
-
99
- ## add convenience (shortcut) accessors / attributes / fields / getters
100
-
101
- ATTRIBUTES.each do |attrib|
102
- ## e.g.
103
- ## def background() data['Introduction']['Background']['text']; end
104
- ## def location() data['Geography']['Location']['text']; end
105
- ## etc.
106
- if attrib.path.size == 1
107
- define_method attrib.name.to_sym do
108
- @data.fetch( attrib.category, {} ).
109
- fetch( attrib.path[0], {} )['text']
110
- end
111
- else ## assume size 2 for now
112
- define_method attrib.name.to_sym do
113
- @data.fetch( attrib.category, {} ).
114
- fetch( attrib.path[0], {} ).
115
- fetch( attrib.path[1], {} )['text']
116
- end
117
- end
118
- end
119
-
120
-
121
- private
122
- def fetch_page( url )
123
- response = Webget.page( url )
124
-
125
- ## note: exit on get / fetch error - do NOT continue for now - why? why not?
126
- exit 1 if response.status.nok? ## e.g. HTTP status code != 200
127
-
128
-
129
- response.text
130
- end
131
-
132
-
133
- =begin
134
- def self.from_url( cc, cn )
135
- html_ascii = PageFetcher.new.fetch( cc )
136
- self.new( cc, cn, html_ascii )
137
- end
138
-
139
- def self.from_file( cc, cn, opts={} )
140
- input_dir = opts[:input_dir] || '.'
141
- html_ascii = File.read( "#{input_dir}/#{cc}.html" ) ## fix/todo: use ASCII8BIT/binary reader
142
- self.new( cc, cn, html_ascii )
143
- end
144
- =end
145
-
146
-
147
- end # class Page
148
- end # module Factbook
@@ -1,12 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- PageInfo = Struct.new( :country_code,
6
- :country_name,
7
- :country_affiliation,
8
- :region_code,
9
- :region_name,
10
- :last_updated )
11
-
12
- end # module Factbook
@@ -1,51 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class JsonPageReader
7
- def initialize( json_dir )
8
- @json_dir = json_dir
9
- end
10
-
11
- def read_page( code )
12
- path = "#{@json_dir}/#{region_to_slug(code.region)}/#{code.code}.json"
13
-
14
- puts "reading #{code.code} #{code.name} (#{code.region}) [#{path}]..."
15
- json = File.read( path )
16
-
17
- ## todo/fix/quick hack: for now until we have a proper header/meta/info section in json
18
- # add some page info from code struct
19
-
20
- info = PageInfo.new
21
- info.country_code = code.code
22
- info.country_name = code.name
23
- info.region_name = code.region
24
-
25
- page = Page.new( code.code, json: json, info: info )
26
- page
27
- end
28
-
29
- def read_pages( codes, limit: nil )
30
- pages = []
31
- i=0
32
- codes.each do |code|
33
- next if limit && i > limit ## for debugging just process first x entries
34
-
35
- pages << read_page( code )
36
- end
37
- pages
38
- end
39
-
40
- private
41
- def region_to_slug( text )
42
- ## change and => n
43
- ## change & => n
44
- ## change all spaces to => -
45
- ## e.g. East & Southeast Asia => east-n-southeast-asia
46
- ## Central America and Caribbean => central-america-n-caribbean
47
- text.downcase.gsub('and', 'n').gsub( '&', 'n' ).gsub( ' ', '-' )
48
- end
49
- end ## JsonPageReader
50
-
51
- end # module Factbook
@@ -1,178 +0,0 @@
1
-
2
- module Factbook
3
-
4
- class Sanitizer
5
- include LogUtils::Logging
6
- include Utils ## pulls in encode_utf8, ...
7
-
8
-
9
- def sanitize( html_ascii )
10
- ## todo: add option for (html source) encoding - why?? why not??
11
-
12
- ## note:
13
- ## returns 1) html profile without headers, footers, scripts, etc.
14
- ## 2) page (meta) info e.g. country_name, country_code, last_updated, etc.
15
- ## 3) errors e.g. list of errors e.g. encoding errors (invalid byte sequence etc.)
16
-
17
- page_info = PageInfo.new
18
-
19
- ## todo:
20
- ## make page info optional? why? why not?
21
- ## not always available (if page structure changes) - check
22
- ## what page info is required??
23
- h = find_page_info( html_ascii )
24
- if h
25
- page_info.country_code = h[:country_code]
26
- page_info.country_name = h[:country_name]
27
- page_info.country_affiliation = h[:country_affiliation]
28
- page_info.region_code = h[:region_code]
29
- page_info.region_name = h[:region_name]
30
- else
31
- page_info.country_code = find_country_code( html_ascii )
32
- ## print/warn: no page info found
33
- end
34
-
35
-
36
- page_info.last_updated = find_page_last_updated( html_ascii )
37
-
38
-
39
- html = find_country_profile( html_ascii ) ## cut-off headers, footers, scripts, etc.
40
-
41
- ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
42
- # html, errors = encode_utf8( html_profile_ascii ) ## change encoding to utf-8 (from binary/ascii8bit)
43
-
44
- # html = sanitize_profile( html )
45
-
46
- [html, page_info, []]
47
- end
48
-
49
-
50
- #
51
- # <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
52
- #
53
- # remove aria labels
54
- ARIA_ATTR_REGEX = /\s*
55
- aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+?
56
- /xim ## do NOT allow multi-line - why? why not?
57
-
58
-
59
- def find_country_profile( html )
60
- ####
61
- ## remove header (everything before)
62
- ## <ul class="expandcollapse">
63
-
64
- doc = Nokogiri::HTML( html )
65
-
66
- ul = doc.css( 'ul.expandcollapse' )[0]
67
-
68
- puts ul.to_html[0..100]
69
-
70
-
71
-
72
- ## note: special case cc uses h2 instead of div block
73
- ## <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
74
- ## style="border-bottom: 2px solid white; cursor: pointer;">
75
- ## Introduction :: <span class="region">CURACAO </span>
76
- ## </h2>
77
- ## is old format !!!!
78
- ## cc - CURACAO
79
- ## http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
80
- ## page says - PAGE LAST UPDATED ON MARCH 14, 2018
81
- ## wait for new version to be generated / pushed!!!
82
-
83
- ## check for old format if h2 are present
84
- h2s = ul.css( 'h2' )
85
- if h2s.size > 0
86
- puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
87
- ## return empty html string - why? why not?
88
- return ''
89
- end
90
-
91
-
92
- ###
93
- ## sanitize
94
-
95
- ## remove link items
96
- ## assume two <li>s are a section
97
-
98
- html = String.new('')
99
-
100
- ## filter all li's
101
- ul_children = ul.children.select { |el| if el.name == 'li'
102
- true
103
- else
104
- # puts "skipping #{el.name} >#{el.to_html}<"
105
- false
106
- end
107
- }
108
- puts " #{ul_children.size} li(s):"
109
- ul_children.each_slice(2) do |lis|
110
- li = lis[0]
111
- div = li.at( 'div[sectiontitle]' )
112
- if div.nil?
113
- puts "!! ERROR: no section title found in div:"
114
- puts li.to_html
115
- exit 1
116
- end
117
-
118
- section_title = div['sectiontitle'].to_s
119
-
120
- html << "<h2>#{section_title}</h2>\n"
121
-
122
-
123
- li = lis[1]
124
- ## filter all div's
125
- li_children = li.children.select { |el| if el.name =='div'
126
- true
127
- else
128
- # puts "skipping #{el.name} >#{el.to_html}<"
129
- false
130
- end
131
- }
132
- puts " #{li_children.size} div(s):"
133
-
134
- li_children.each_slice(2) do |divs|
135
- div = divs[0]
136
- a = div.css('a')[0]
137
-
138
- if a
139
- html << "\n<h3>#{a.text}:</h3>\n"
140
- else
141
- puts "!! WARN: no anchor found:"
142
- puts div.to_html
143
- end
144
-
145
-
146
- div = divs[1]
147
- div_children = div.children.select {|el| el.name == 'div' ? true : false }
148
- div_children.each do |catdiv|
149
- if catdiv['class'] && catdiv['class'].index( 'category_data' )
150
-
151
- if catdiv['class'].index( 'attachment' )
152
- ## skip attachments e.g. maps, pop pyramids, etc.
153
- else
154
- html << catdiv.to_html
155
- html << "\n"
156
- end
157
- else
158
- puts "!! WARN: skipping div (W/O category_data class):"
159
- puts catdiv.to_html
160
- end
161
- end
162
- end
163
- end
164
-
165
-
166
- html = html.gsub( ARIA_ATTR_REGEX ) do |m|
167
- puts "remove aria-label attr:"
168
- puts "#{m}"
169
- ''
170
- end
171
-
172
- html
173
- end
174
-
175
-
176
- end # class Sanitizer
177
-
178
- end # module Factbook
@@ -1,29 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Sect
7
- include LogUtils::Logging
8
-
9
- attr_accessor :title ## use name instead of title - why? why not?
10
- attr_accessor :subsects
11
-
12
- def initialize
13
- @subsects = []
14
- end
15
-
16
- def data
17
- ## convert sects to hash
18
- @data = {}
19
-
20
- subsects.each_with_index do |subsect,i|
21
- @data[ subsect.title ] = subsect.data
22
- end
23
- @data
24
- end
25
-
26
-
27
- end # class Sect
28
-
29
- end # module Factbook
@@ -1,18 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
-
6
- class Subsect
7
- include LogUtils::Logging
8
-
9
- attr_accessor :title ## use name instead of title - why? why not?
10
- attr_accessor :data ## hash holding data e.g. { 'text' => '...' etc. }
11
-
12
- def initialize
13
- @data = {}
14
- end
15
-
16
- end # class Subsect
17
-
18
- end # module Factbook
@@ -1,52 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- ##
6
- ## make more "generic" - why? why not?
7
- ## (re)use for other files ?? move to textutils ??
8
-
9
- ##
10
- ## for now reads in rows with values separated by at least 3+ spaces e.g.:
11
- ## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
12
- ## 1 China 1,367,485,388
13
- ## 2 India 1,251,695,584
14
- ## 3 European Union 513,949,445
15
- ## 4 United States 321,368,864
16
- ## 5 Indonesia 255,993,674
17
- ## 6 Brazil 204,259,812
18
-
19
-
20
- class TableReader
21
- include LogUtils::Logging
22
-
23
-
24
- def initialize( text )
25
- @text = text
26
- end
27
-
28
- def read
29
- recs = []
30
-
31
- line_no = 0
32
- @text.each_line do |line|
33
- line_no +=1
34
- line = line.strip ## remove leading and trailing whitespace
35
- if line.empty?
36
- puts "** skipping empty line #{line_no}"
37
- next
38
- end
39
-
40
- values = line.split( /[ ]{3,}/ ) ## split three or more spaces - use just two ?? why? why not??
41
-
42
- ## puts line
43
- ## pp values
44
- recs << values
45
- end
46
- recs
47
- end
48
-
49
-
50
- end # class TableReader
51
-
52
- end # module Factbook