factbook 0.1.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/Manifest.txt +34 -22
  3. data/README.md +8 -3
  4. data/Rakefile +2 -263
  5. data/data/codes.csv +262 -0
  6. data/data/comparisons.csv +75 -0
  7. data/lib/factbook/builder.rb +214 -0
  8. data/lib/factbook/builder_item.rb +93 -0
  9. data/lib/factbook/codes.rb +119 -0
  10. data/lib/factbook/comparisons.rb +50 -0
  11. data/lib/factbook/page.rb +103 -303
  12. data/lib/factbook/sanitizer.rb +214 -0
  13. data/lib/factbook/sect.rb +29 -196
  14. data/lib/factbook/subsect.rb +18 -0
  15. data/lib/factbook/table.rb +52 -0
  16. data/lib/factbook/utils.rb +85 -0
  17. data/lib/factbook/utils_info.rb +102 -0
  18. data/lib/factbook/version.rb +4 -3
  19. data/lib/factbook.rb +23 -1
  20. data/test/data/au.html +579 -0
  21. data/test/data/au.yml +8 -0
  22. data/test/data/be.html +596 -0
  23. data/test/data/be.yml +8 -0
  24. data/test/data/src/au.html +2006 -0
  25. data/test/data/src/be.html +2011 -0
  26. data/test/helper.rb +0 -4
  27. data/test/test_builder.rb +37 -0
  28. data/test/test_codes.rb +76 -0
  29. data/test/test_comparisons.rb +19 -0
  30. data/test/test_fields.rb +21 -18
  31. data/test/test_item_builder.rb +99 -0
  32. data/test/test_json.rb +17 -20
  33. data/test/test_page.rb +18 -10
  34. data/test/test_sanitizer.rb +35 -0
  35. metadata +68 -49
  36. data/.gemtest +0 -0
  37. data/test/data/countrytemplate_au.html +0 -4179
  38. data/test/data/countrytemplate_be.html +0 -4260
  39. data/test/data/countrytemplate_br.html +0 -4366
  40. data/test/data/countrytemplate_ee.html +0 -2999
  41. data/test/data/countrytemplate_ls.html +0 -2728
  42. data/test/data/countrytemplate_mx.html +0 -4397
  43. data/test/data/countrytemplate_vt.html +0 -1726
  44. data/test/data/countrytemplate_xx.html +0 -2898
  45. data/test/test_page_old.rb +0 -478
  46. data/test/test_strip.rb +0 -66
data/lib/factbook/sect.rb CHANGED
@@ -1,196 +1,29 @@
1
- # encoding: utf-8
2
-
3
- module Factbook
4
-
5
- class Sect # section (e.g. Introduction/Geography/People/Economy/Energy/Transport/etc.)
6
- include LogUtils::Logging
7
-
8
- attr_reader :title, :html
9
-
10
- def initialize( title, html, opts={} )
11
- ## todo: passing a ref to the parent page - why? why not??
12
- @title = title
13
- @html = html
14
- @opts = opts # fields: full|long|keep|std|??? -- find a good name for the option keeping field names as is
15
-
16
- @doc = nil
17
- @data = nil
18
- end
19
-
20
- def doc
21
- ### check: use nokogiri html fragment? why? why not??
22
- @doc ||= Nokogiri::HTML( @html )
23
- end
24
-
25
- def data
26
- @data ||= sect_to_hash( doc )
27
- end
28
-
29
- private
30
-
31
- def cleanup_key( key )
32
-
33
- if @opts[:fields] # if set assume full|long|keep for now
34
- ### kepe field names as is
35
- ## e.g.
36
- ## GDP - composition, by sector of origin:
37
- ## Budget surplus (+) or deficit (-):
38
- ## becomes:
39
- ## GDP - composition, by sector of origin
40
- ## Budget surplus (+) or deficit (-)
41
- key = key.strip
42
- key = key.gsub( /[ ]{2,}/, ' ' ) # fold two plus spaces into one -- check if exists?
43
- key = key.gsub( /:\z/, '' ) # remove trailing : if present
44
- key = key.strip
45
- else
46
- ## to lower case
47
- key = key.downcase
48
- ## seaport(s) => seaports
49
- key = key.gsub( '(s)', 's' )
50
- key = key.gsub( ':', '' ) # trailing : ## fix: use regex /:$/ w/ anchor??
51
- ## remove special chars ()+-/,'
52
- key = key.gsub( /['()+\-\/,]/, ' ' )
53
- key = key.strip
54
- key = key.gsub( /[ ]+/, '_' )
55
- end
56
-
57
- key
58
- end
59
-
60
-
61
- def sect_to_hash( sect )
62
-
63
- rows = sect.css( 'table tr' )
64
- cells = sect.css( 'table tr td' )
65
- field_ids = rows.css( '#field' ) ## check - use div#field.category -- possible?
66
- data_ids = rows.css( '#data' )
67
-
68
- logger.debug "rows.size: #{rows.size} (cells.size: #{cells.size} / field_ids.size: #{field_ids.size} / data_ids.size: #{data_ids.size})"
69
-
70
- hash = {}
71
- last_cat = nil
72
-
73
- cells.each_with_index do |cell,i|
74
- ## next if i > 14 ## skip after xx for debugging for now
75
-
76
- # check if field or data id
77
- # check for (nested) div#field in td
78
- has_field_id = cell.css( '#field' ).size == 1 ? true : false
79
-
80
- # check for td#data
81
- has_data_id = cell['id'] == 'data' ? true : false
82
-
83
- if has_field_id
84
-
85
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
86
- if cats.size == 1
87
- text = cleanup_key( cats.first.text.strip ) # remove/strip leading and trailing spaces
88
- last_cat = text
89
- logger.debug " [#{i}] category: >>#{text}<<"
90
- else
91
- logger.warn "**** !!!!!! warn/err - found element w/ field id (no match for subsection!!! - check)"
92
- logger.warn cell.to_s
93
- end
94
-
95
- elsif has_data_id
96
-
97
- cats = cell.css( 'div.category' ) ## note: ignore all .category not using div (issue warn/err if found!!) etc.
98
- cats_data = cell.css( 'div.category_data,span.category_data' ) ## note: ignore a.category_data etc.
99
- cats_div_data = cell.css( 'div.category_data' )
100
- cats_span_data = cell.css( 'span.category_data' )
101
-
102
- logger.debug " - [#{i}] data cell - cats: #{cats.size}, cats_data: #{cats_data.size} (cats_div_data: #{cats_div_data.size} / cats_span_data: #{cats_span_data.size})"
103
-
104
- pairs = []
105
- last_pair = nil
106
- last_pair_data_count = 0
107
-
108
- ## loop over div blocks (might be .category or .category_data)
109
- cell.children.each_with_index do |child,j|
110
- unless child.element?
111
- ## puts " **** !!!! skipping non-element type >#{child.type}<:"
112
- ## puts child.to_s
113
- next
114
- end
115
- unless child.name == 'div'
116
- logger.warn " **** !!! skipping non-div >#{child.name}<:"
117
- logger.warn child.to_s
118
- next
119
- end
120
-
121
- ### check if .category or .category_data
122
- if child['class'] == 'category'
123
-
124
- ## collect text for category; exclude element w/ class.category_data
125
- text = ""
126
- child.children.each do |subchild|
127
- text << subchild.text.strip unless subchild.element? && subchild['class'] == 'category_data'
128
- end
129
- text = cleanup_key( text )
130
-
131
- value = child.css('span.category_data').text.strip
132
-
133
- logger.debug " -- category >>#{text}<<"
134
-
135
- ## start new pair
136
- last_pair = [ text, value ]
137
- last_pair_data_count = 0
138
- pairs << last_pair
139
-
140
- elsif child['class'] == 'category_data'
141
- logger.debug " -- category_data"
142
-
143
- text = child.text.strip
144
-
145
- if last_pair.nil?
146
- ## assume its the very first entry; use implied/auto-created category
147
- last_pair = [ 'text', '' ]
148
- last_pair_data_count = 0
149
- pairs << last_pair
150
- end
151
-
152
- ### first category_data element?
153
- if last_pair_data_count == 0
154
- if last_pair[1] == ''
155
- last_pair[1] = text
156
- else
157
- last_pair[1] += " #{text}" ## append w/o separator
158
- end
159
- else
160
- if last_cat == 'demographic_profile' || last_cat == 'Demographic profile' ## special case (use space a sep)
161
- last_pair[1] += " #{text}" ## append with separator
162
- else
163
- last_pair[1] += "; #{text}" ## append with separator
164
- end
165
- end
166
- last_pair_data_count += 1
167
-
168
- else
169
- logger.warn " **** !!! skipping div w/o category or category_data class:"
170
- logger.warn child.to_s
171
- end
172
- end
173
-
174
- ## pp pairs
175
-
176
- ## pairs to hash
177
- pairs_hash = {}
178
- pairs.each do |pair|
179
- pairs_hash[ pair[0] ] = pair[1]
180
- end
181
-
182
- hash[ last_cat ] = pairs_hash
183
-
184
- else
185
- logger.warn "#### !!!! unknown cell type (no field or data id found):"
186
- logger.warn cell.to_s
187
- end
188
- end # each cell
189
-
190
- hash # return hash
191
-
192
- end # method sect_to_hash
193
-
194
- end # class Sect
195
-
196
- end # module Factbook
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+
6
# A section of a factbook page: a title plus an ordered list of subsections.
class Sect
  include LogUtils::Logging

  # title    - section heading
  # subsects - ordered list of entries responding to #title and #data
  attr_accessor :title, :subsects   ## use name instead of title - why? why not?

  # Starts out with no subsections.
  def initialize
    @subsects = []
  end

  # Builds a fresh hash on every call that maps each subsection's title to
  # that subsection's data, stores it in @data and returns it.
  def data
    @data = {}
    subsects.each_with_object( @data ) do |entry, collected|
      collected[ entry.title ] = entry.data
    end
  end
end # class Sect
28
+
29
+ end # module Factbook
@@ -0,0 +1,18 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+
6
# A subsection entry: a title plus a hash of its data.
class Subsect
  include LogUtils::Logging

  # title - subsection heading
  # data  - hash holding the data, e.g. { 'text' => '...' }
  attr_accessor :title, :data   ## use name instead of title - why? why not?

  # Starts out with an empty data hash; the title is left unset (nil).
  def initialize
    @data = {}
  end
end # class Subsect
17
+
18
+ end # module Factbook
@@ -0,0 +1,52 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+
5
+ ##
6
+ ## make more "generic" - why? why not?
7
+ ## (re)use for other files ?? move to textutils ??
8
+
9
+ ##
10
+ ## for now reads in rows with values separated by three or more spaces, e.g.:
11
+ ## see www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt
12
+ ## 1   China   1,367,485,388
13
+ ## 2   India   1,251,695,584
14
+ ## 3   European Union   513,949,445
15
+ ## 4   United States   321,368,864
16
+ ## 5   Indonesia   255,993,674
17
+ ## 6   Brazil   204,259,812
18
+
19
+
20
# Reads plain-text tables whose columns are separated by runs of three or
# more spaces and returns the rows as arrays of strings.
class TableReader
  include LogUtils::Logging

  # Column separator: three or more consecutive spaces
  # (use just two ?? why? why not??)
  COLUMN_SEPARATOR = /[ ]{3,}/

  # text - the complete table as one string (one record per line)
  def initialize( text )
    @text = text
  end

  # Splits every non-blank line into its column values.
  #
  # Leading and trailing whitespace is stripped from each line first; blank
  # lines are skipped and reported through the logger (provided by the
  # LogUtils::Logging mixin) instead of being printed to stdout.
  #
  # Returns an array of records, each record an array of strings.
  def read
    recs = []

    @text.each_line.with_index( 1 ) do |raw_line, line_no|
      line = raw_line.strip
      if line.empty?
        logger.debug "** skipping empty line #{line_no}"
        next
      end

      recs << line.split( COLUMN_SEPARATOR )
    end

    recs
  end
end # class TableReader
51
+
52
+ end # module Factbook
@@ -0,0 +1,85 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+ module Utils
5
+
6
+ ########################################
7
+ ## todo: move to textutils - why, why not ?????
8
+
9
# Converts +text+ to UTF-8, substituting U+FFFD for bytes that cannot be
# converted, and reports where the substitutions happened.
#
# note: factbook claims utf-8 - but includes invalid bytes in some pages
#       (encoding is likely western/windows-...)
#
# NOTE(review): the source encoding is passed as 'binary'; that appears to
#   make every byte outside ASCII count as undefined, so valid multi-byte
#   UTF-8 sequences would be replaced as well - confirm this is intended.
#
# text - string to convert (its bytes are read as binary)
#
# Returns a two-element array [converted_text, errors]; errors holds one
# timestamped message per replacement character found.
def encode_utf8( text )
  replacement = "\uFFFD"   ## unknown/invalid unicode char
  errors      = []         ## also return list of encoding errors

  text = text.encode( 'UTF-8', 'binary', :invalid => :replace,
                                          :undef   => :replace,
                                          :replace => replacement )

  ## check for replaced/invalid chars and log warning
  pos = text.index( replacement )
  while pos
    ## clamp the window start - a negative index would count from the end
    ## of the string and yield the wrong (or no) context
    from   = [pos-10, 0].max
    to     = pos+10          ## running past the end is fine; the slice is cut off
    around = text[from..to]
    puts " pos #{pos}, from #{from}, to #{to}, around >#{around}<"
    msg = "invalid char on pos #{pos} around: >#{around}<"
    puts msg

    ## also keep the message w/ timestamp
    errors << "#{Time.now} - #{msg}"

    pos = text.index( replacement, pos+1 )
  end

  [text, errors]   ## return text and errors (list)
end
41
+
42
+
43
+
44
# Formats one record as a single CSV line (without trailing newline).
#
# values - list of field values; each is converted with to_s (nil becomes
#          an empty field)
#
# Per field:
# - numbers written with thousands separators lose their commas, keeping an
#   optional leading minus and/or dollar sign,
#   e.g. 17,098,242 => 17098242, $100,000,000 => $100000000, -44,000 => -44000
# - fields containing a comma, a double quote or a newline are wrapped in
#   double quotes, with embedded double quotes doubled,
#   e.g. Guam, The => "Guam, The"
# - everything else is added as is
#
# Returns the comma-joined line as a new string.
def values_to_csv( values )
  fields = values.map do |value|
    value = value.to_s

    ## \A and \z anchor the whole value (not just one line of it)
    if value =~ /\A-?\$?[1-9][,0-9]+[0-9]\z/   ### find a better regex - why? why not??
      value.delete( ',' )
    elsif value =~ /[,"\n]/
      %Q{"#{value.gsub( '"', '""' )}"}
    else
      value
    end
  end

  fields.join( ',' )
end
67
+
68
+
69
# Renders a header row followed by all records as CSV text.
#
# recs    - list of records (each a list of values)
# headers - list of column names, written as the first line
#
# Every row is formatted with values_to_csv and terminated with "\n".
# Returns the complete text as one string.
def data_to_csv( recs, headers )
  lines = [ values_to_csv( headers ) ]
  recs.each { |rec| lines << values_to_csv( rec ) }

  lines.map { |line| "#{line}\n" }.join
end
82
+
83
+
84
+ end # module Utils
85
+ end # module Factbook
@@ -0,0 +1,102 @@
1
+ # encoding: utf-8
2
+
3
+ module Factbook
4
+ module Utils
5
+
6
+ #######
7
+ ## find meta data (about page info)
8
+
9
+
10
+ #### e.g. Page last updated on September 16, 2015
11
+
12
# English month name (capitalized) => month number as a string.
MONTH_EN_TO_S = {
  'January'   => '1',
  'February'  => '2',
  'March'     => '3',
  'April'     => '4',
  'May'       => '5',
  'June'      => '6',
  'July'      => '7',
  'August'    => '8',
  'September' => '9',
  'October'   => '10',
  'November'  => '11',
  'December'  => '12'
}.freeze

# Matches e.g. "Page last updated on September 16, 2015"
# (case-insensitive; captures month_en, day and year).
PAGE_LAST_UPDATED_REGEX = /
  Page \s last \s updated \s on \s
  (?<month_en>[a-z]+) \s
  (?<day>\d{1,2}), \s
  (?<year>\d{4})
/imx

# Finds the "Page last updated on <month> <day>, <year>" note in +html+.
#
# html - page source to search
#
# Returns the date as a Date, or nil when the note is missing or the month
# name is not a known English month. Raises ArgumentError (via Date.new)
# when the captured day does not exist in that month.
def find_page_last_updated( html )
  m = PAGE_LAST_UPDATED_REGEX.match( html )
  return nil unless m

  ## the regex matches any letter case; the lookup table is keyed by
  ## capitalized names - normalize before looking up
  month = MONTH_EN_TO_S[ m[:month_en].capitalize ]
  return nil unless month

  Date.new( m[:year].to_i, month.to_i, m[:day].to_i )
end
52
+
53
+ ##
54
+ ## e.g. regioncode="eur"
55
+ ## countrycode="au"
56
+ ## countryname="Austria"
57
+ ## flagsubfield=""
58
+ ## countryaffiliation=""
59
+ ## flagdescription=""
60
+ ## flagdescriptionnote=""
61
+ ## region="Europe"
62
+ ##
63
+ ## note: countryaffiliation may be empty
64
+
65
+
66
+
67
# Matches the attribute run that carries the page's country and region info.
# Expected attribute order: regioncode, countrycode, countryname,
# (any other attributes), countryaffiliation, (any other attributes), region.
# Values may be single- or double-quoted; countryaffiliation may be empty.
PAGE_INFO_REGEX = /
  regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
  \s+
  countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2>
  \s+
  countryname=(?<q3>"|')(?<country>.+?)\k<q3>
  \s+
  [^>]*?        # allow any other attributes - non-greedy, may be none
  countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4>   # note: might be empty
  \s+
  [^>]*?        # allow any other attributes - non-greedy, may be none
  region=(?<q5>"|')(?<region>.+?)\k<q5>   # check world - might be empty ?? or for ocean ??
/imx

# Extracts the country and region meta data from +html+.
#
# html - page source to search
#
# Returns a hash with the keys :country_code, :country_name,
# :country_affiliation, :region_code and :region_name (all strings),
# or nil when the attributes are not found.
def find_page_info( html )
  m = PAGE_INFO_REGEX.match( html )
  return nil unless m   ## or return empty struct with nils/empty strings - why?? why not??

  { country_code:        m[:country_code],
    country_name:        m[:country],
    country_affiliation: m[:affiliation],
    region_code:         m[:region_code],
    region_name:         m[:region] }
end
99
+
100
+
101
+ end # module Utils
102
+ end # module Factbook
@@ -1,9 +1,10 @@
1
+ # encoding: utf-8
1
2
 
2
3
  module Factbook
3
4
 
4
- MAJOR = 0
5
- MINOR = 1
6
- PATCH = 3
5
+ MAJOR = 1
6
+ MINOR = 0
7
+ PATCH = 0
7
8
  VERSION = [MAJOR,MINOR,PATCH].join('.')
8
9
 
9
10
  def self.version
data/lib/factbook.rb CHANGED
@@ -7,6 +7,7 @@ require 'uri'
7
7
  require 'cgi'
8
8
  require 'pp'
9
9
  require 'json'
10
+ require 'csv'
10
11
  require 'fileutils'
11
12
 
12
13
 
@@ -21,11 +22,32 @@ require 'nokogiri'
21
22
  # our own code
22
23
 
23
24
  require 'factbook/version' # let it always go first
25
+ require 'factbook/utils'
26
+ require 'factbook/utils_info'
27
+ require 'factbook/sanitizer'
28
+ require 'factbook/builder_item'
29
+ require 'factbook/builder'
24
30
  require 'factbook/page'
25
31
  require 'factbook/sect'
32
+ require 'factbook/subsect'
26
33
 
34
+ require 'factbook/codes'
35
+ require 'factbook/comparisons'
27
36
 
37
+ require 'factbook/table' ## e.g. TableReader
28
38
 
29
39
 
30
- puts Factbook.banner
31
40
 
41
module Factbook

  ## built-in country codes and comparisons tables,
  ## read from the bundled csv files when this file is loaded
  CODES       = Codes.from_csv( "#{Factbook.root}/data/codes.csv" )
  COMPARISONS = Comparisons.from_csv( "#{Factbook.root}/data/comparisons.csv")

  class << self
    ## returns the built-in codes table (CODES)
    def codes
      CODES
    end

    ## returns the built-in comparisons table (COMPARISONS)
    def comparisons
      COMPARISONS
    end
  end

end # module Factbook
51
+
52
+
53
+ puts Factbook.banner if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG