quandl_babelfish 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,238 +1,238 @@
1
- module Quandl
2
- module Babelfish
3
-
4
- #responsible for number formatting
5
- class DateMaid
6
- @defaults = {
7
- :format => nil
8
- }
9
-
10
- @settings = @defaults #init with defaults
11
-
12
- class << self
13
-
14
- def init(user_settings)
15
- @settings=@defaults.merge(user_settings)
16
- end
17
-
18
- #looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
19
- def sweep(all_dates)
20
- return nil if all_dates.nil?
21
-
22
- all_dates = disinfect all_dates
23
-
24
- if @settings[:format].nil?
25
- #find good example and extract all info from it and apply it to each of the dates in the set
26
- good_sample = find_good_date(all_dates)
27
-
28
- raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?
29
-
30
- date_format, frequency = analyze_date_format(good_sample)
31
-
32
-
33
- else
34
- date_format = @settings[:format]
35
- end
36
-
37
- iso_dates=[]
38
- all_dates.each_with_index do |fuzzy_date, i|
39
- temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
40
- iso_dates << frequency_transform(temp_date, frequency)
41
- end
42
-
43
- iso_dates
44
- end
45
-
46
- def analyze_date_format(example)
47
- return nil if example.nil?
48
-
49
- # Regular formats and Custom formats (where Date.parse and Date.strptime
50
- # fear to tread)
51
- if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
52
- if re[1].to_i > 12
53
- return '%d-%m-%Y', nil
54
- else
55
- return '%m-%d-%Y', nil
56
- end
57
- end
58
- if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
59
- if re[1].to_i > 12
60
- return '%d-%m-%y', nil
61
- else
62
- return '%m-%d-%y', nil
63
- end
64
- end
65
- # order these guys from most specific to most general
66
- return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
67
- return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
68
- return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
69
- return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
70
- return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
71
- return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
72
- return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
73
- return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
74
- return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
75
- return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
76
- return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
77
- return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
78
- return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
79
- return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
80
- return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
81
- return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
82
- return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
83
- return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
84
- return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
85
- return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
86
- return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
87
- return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits
88
- return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits
89
-
90
- #our custom formats
91
- return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
92
- return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
93
- return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel
94
-
95
- # No, try default date parse
96
- # raise PostProcessorException, "Unable to guess date format for #{example}"
97
- [nil, nil]
98
- end
99
-
100
- def disinfect(dates)
101
- [*dates].collect do |date|
102
- date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
103
- date.to_s.gsub!(/[^\x01-\x7f]/,'')
104
- date.to_s.strip.gsub(/\s\s+/, ' ')
105
- end
106
- end
107
- private
108
-
109
-
110
- #converts date to specified format
111
- def convert(fuzzy_date, date_format)
112
- if date_format.nil?
113
- # Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
114
- tokens = fuzzy_date.split(/\D/)
115
- if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
116
- # Its ISO
117
- return DateTime.parse(fuzzy_date.to_s).to_date
118
- else
119
- # Guessing US
120
- return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
121
- end
122
- else
123
- case date_format
124
- when ':year_quarter'
125
- return year_quarter_formatter(fuzzy_date)
126
- when ':excel-1900'
127
- return excel_1900_formatter(fuzzy_date)
128
- else #regular ruby formatter
129
- return regular_formatter(fuzzy_date, date_format)
130
- end
131
-
132
- end
133
- end
134
-
135
-
136
- def year_quarter_formatter(fuzzy_date)
137
- raw_date = fuzzy_date
138
- tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
139
- tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
140
- Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
141
- end
142
-
143
- def excel_1900_formatter(fuzzy_date)
144
- # handle Lotus 123 bug has 1900 as a leap year
145
- Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
146
- end
147
-
148
- def regular_formatter(fuzzy_date, date_format)
149
- # We have a date format - oh so pretty, but...
150
- date_string = fuzzy_date
151
- # normalize delimiters to hyphens so we do not have to make a format for each one.
152
- # delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
153
- # only if no format where provided
154
- date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?
155
-
156
- #epoch date string
157
- if date_format == 'epoch'
158
- news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
159
- formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
160
- else
161
- if date_string.to_s =~ /^(\w{3})\D(\d{2})$/
162
- century = $2.to_i < 25 ? '20' : '19'
163
- date_string = "#{$1} #{century}#{$2}"
164
- formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
165
- else
166
- formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
167
- end
168
- end
169
- formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
170
- formatted_date
171
- end
172
-
173
-
174
-
175
- #find good example of date to use as template for format
176
- def find_good_date(all_dates)
177
- good_sample=nil
178
- all_dates.each do |fuzzy_date|
179
- if usable_cell(fuzzy_date)
180
- good_sample = fuzzy_date
181
- break
182
- end
183
- end
184
- good_sample
185
- end
186
-
187
- def usable_cell(cell)
188
- return false if cell.nil? || cell.to_s.empty?
189
- return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20
190
-
191
- return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
192
- # date is not usable as an example if it is ambiguous as to day and month
193
- # 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
194
- if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
195
- if re[1].to_i <= 12 and re[2].to_i <= 12
196
- return false
197
- else
198
- return true
199
- end
200
- end
201
-
202
- if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
203
- if re[1].to_i <= 12 and re[2].to_i <= 12
204
- return false
205
- else
206
- return true
207
- end
208
- end
209
-
210
- return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere
211
-
212
- return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous
213
-
214
- false # Thank you, come again
215
- end
216
-
217
- # Bump date to the end of the respective periods
218
- def frequency_transform(date, frequency)
219
- case frequency
220
- when 'annual'
221
- date = Date.new(date.year,12,31)
222
- when 'quarterly'
223
- month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
224
- date = Date.new(date.year, month, 1).next_month-1
225
- when 'monthly'
226
- date = Date.new(date.year, date.month,1).next_month-1
227
- else
228
- # Do nothing for daily or weekly
229
- end
230
-
231
- date
232
- end
233
-
234
-
235
- end
236
- end
237
- end
1
module Quandl
  module Babelfish

    # Responsible for date cleaning: takes loosely formatted date strings,
    # guesses (or is told) their format and turns them into Date objects,
    # bumped to the end of their period for annual/quarterly/monthly data.
    #
    # All state lives in class-level instance variables (@defaults and
    # @settings), so a call to .init affects every later call to .sweep.
    class DateMaid
      @defaults = {
        :format => nil # nil means "guess the format from the data"
      }

      @settings = @defaults # init with defaults

      class << self

        # Replaces the active settings with the defaults overlaid by
        # +user_settings+.
        #
        # @param user_settings [Hash] currently only :format is read
        #   (a strptime format string, or one of the custom tokens
        #   ':year_quarter', ':excel-1900', 'epoch').
        # @return [Hash] the merged settings
        def init(user_settings)
          @settings = @defaults.merge(user_settings)
        end

        # Looks at all the dates and converts each one to a Date object.
        #
        # When no :format setting is present, the first unambiguous value is
        # used as a template to guess the format and frequency for the whole
        # set.
        #
        # @param all_dates [Array, Object, nil] one value or a list of values;
        #   each is stringified and sanitised by #disinfect first.
        # @return [Array<Date, nil>, nil] nil when +all_dates+ is nil; otherwise
        #   one entry per input (an entry is nil when the Excel serial
        #   formatter receives a non-positive number).
        # @raise [Error::GuessDateFormat] when no value is usable as a template
        # @raise [Error::InvalidDate] when a value cannot be converted
        def sweep(all_dates)
          return nil if all_dates.nil?

          all_dates = disinfect(all_dates)
          date_format = @settings[:format]
          frequency = nil # stays nil (no period bump) when the format is user supplied

          if date_format.nil?
            # find good example and extract all info from it and apply it to each of the dates in the set
            good_sample = find_good_date(all_dates)

            raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?

            date_format, frequency = analyze_date_format(good_sample)
          end

          all_dates.each_with_index.map do |fuzzy_date, i|
            begin
              temp_date = convert(fuzzy_date, date_format)
            rescue StandardError
              # Any conversion failure is reported with the 1-based position of the offending value.
              raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
            end
            frequency_transform(temp_date, frequency)
          end
        end

        # Guesses the format and frequency of a sample date string.
        #
        # @param example [String, nil] a sanitised sample date
        # @return [Array(String, String), Array(nil, nil), nil] a pair of
        #   [format, frequency]. The format is either a strptime pattern or
        #   one of the custom tokens handled by #convert; frequency is
        #   'annual', 'quarterly', 'monthly', 'weekly', 'daily' or nil.
        #   [nil, nil] when nothing matched; bare nil when +example+ is nil.
        def analyze_date_format(example)
          return nil if example.nil?

          # Regular formats and Custom formats (where Date.parse and Date.strptime
          # fear to tread)
          if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
            # A first field above 12 cannot be a month, so it must be the day.
            return '%d-%m-%Y', nil if re[1].to_i > 12
            return '%m-%d-%Y', nil
          end
          if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
            return '%d-%m-%y', nil if re[1].to_i > 12
            return '%m-%d-%y', nil
          end

          # order these guys from most specific to most general
          return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
          return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
          return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
          return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
          return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
          return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
          return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
          return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
          return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
          return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
          return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
          return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
          return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
          return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
          return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
          return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
          return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
          return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
          return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
          return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
          return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
          return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, separator, 3 letters, separator, 2 digits
          return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits

          # our custom formats
          return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
          return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
          return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel

          # Nothing matched: #convert falls back to default date parsing.
          [nil, nil]
        end

        # Stringifies and sanitises every value: invalid UTF-8 bytes and all
        # characters outside \x01-\x7f are dropped, surrounding whitespace is
        # stripped and runs of whitespace collapse to a single space.
        #
        # Works on copies, so the caller's strings are never modified and
        # frozen strings are accepted.
        #
        # @param dates [Array, Object] one value or a list of values
        # @return [Array<String>] a new array of cleaned strings
        def disinfect(dates)
          [*dates].collect do |date|
            text = date.to_s.encode('UTF-8', 'UTF-8', :invalid => :replace)
            text.gsub(/[^\x01-\x7f]/, '').strip.gsub(/\s\s+/, ' ')
          end
        end

        private

        # Converts one sanitised string to a Date using +date_format+
        # (a strptime pattern, a custom token, or nil for best-effort parsing).
        def convert(fuzzy_date, date_format)
          if date_format.nil?
            # Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
            tokens = fuzzy_date.split(/\D/)
            # NOTE(review): /\w{2}/ matches any two adjacent word characters,
            # digits included, so almost every input takes the parse branch.
            if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
              # Its ISO
              DateTime.parse(fuzzy_date.to_s).to_date
            else
              # Guessing US
              Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
            end
          else
            case date_format
            when ':year_quarter'
              year_quarter_formatter(fuzzy_date)
            when ':excel-1900'
              excel_1900_formatter(fuzzy_date)
            else # regular ruby formatter
              regular_formatter(fuzzy_date, date_format)
            end
          end
        end

        # Parses "<year><sep/Q><quarter>" into the first day of the quarter's
        # last month (e.g. 2012Q2 -> 2012-06-01).
        def year_quarter_formatter(fuzzy_date)
          tokens = fuzzy_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
          tokens.delete_if { |x| x.nil? || x.empty? } # In case there are more than one delimiter because we replaced the Q
          Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
        end

        # Converts an Excel (1900 date system) serial number to a Date.
        # Returns nil when the value is not a positive integer.
        def excel_1900_formatter(fuzzy_date)
          # handle Lotus 123 bug has 1900 as a leap year
          Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
        end

        # Parses +fuzzy_date+ with a strptime pattern (or the 'epoch' token).
        def regular_formatter(fuzzy_date, date_format)
          date_string = fuzzy_date
          # normalize delimiters to hyphens so we do not have to make a format for each one.
          # delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
          # only if no format was provided
          date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?

          if date_format == 'epoch'
            # Seconds since the Unix epoch, read as a UTC calendar date.
            formatted_date = Time.at(date_string.to_i).utc.to_date
          elsif date_string.to_s =~ /^(\w{3})\D(\d{2})$/
            # Month name plus two-digit year: years below 25 are read as 20xx,
            # everything else as 19xx.
            century = $2.to_i < 25 ? '20' : '19'
            date_string = "#{$1} #{century}#{$2}"
            formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
          else
            formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
          end

          formatted_date += 4 if date_format == '%GW%V' # strptime makes dates on Mondays. We want Fridays.
          formatted_date
        end

        # Finds the first date that is a good template for format guessing.
        # Returns nil when no value qualifies.
        def find_good_date(all_dates)
          all_dates.find { |fuzzy_date| usable_cell(fuzzy_date) }
        end

        # Tells whether +cell+ can be used as a format template, i.e. it looks
        # like a date and its day/month order is not ambiguous.
        def usable_cell(cell)
          text = cell.to_s
          return false if cell.nil? || text.empty?
          return false if text.size > 20 # even annotated date can't be bigger than 20

          return true if text =~ /^\w{3}\D[456789]\d$/
          # date is not usable as an example if it is ambiguous as to day and month
          # 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
          if re = text.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
            return re[1].to_i > 12 || re[2].to_i > 12
          end

          if re = text.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
            return re[1].to_i > 12 || re[2].to_i > 12
          end

          return true if text =~ /\d{4}/ # It has a 4 digit year somewhere

          return true if text =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous

          false
        end

        # Bump date to the end of the respective periods
        def frequency_transform(date, frequency)
          case frequency
          when 'annual'
            Date.new(date.year, 12, 31)
          when 'quarterly'
            month = 3 * ((date.month - 1) / 3 + 1) # equals 3,6,9 or 12
            Date.new(date.year, month, -1) # day -1 is the last day of the month
          when 'monthly'
            Date.new(date.year, date.month, -1)
          else
            # Do nothing for daily or weekly
            date
          end
        end

      end
    end
  end
end
@@ -0,0 +1,9 @@
1
class Helper

  # Actions expect a square table, make it so.
  #
  # Pads every row with trailing nils up to the length of the longest row.
  # Neither the table nor its rows are modified; new arrays are returned.
  #
  # @param table [Array<Array>] rows of possibly different lengths
  # @return [Array<Array>] rows that all share the longest row's length
  def self.make_square(table)
    longest_row = table.map(&:length).max || 0 # max is nil for an empty table
    table.map { |row| row + Array.new(longest_row - row.length) }
  end
end
@@ -1,80 +1,80 @@
1
- module Quandl
2
- module Babelfish
3
-
4
- #responsible for number cleaning
5
- class NumberMaid
6
- @defaults = {
7
- :decimal_mark => Regexp.escape('.'),
8
- :ignore_brackets => false, # Brackets ARE negative by default
9
- }
10
-
11
- @settings = @defaults #init with defaults
12
-
13
- class << self
14
-
15
-
16
- def init(user_settings)
17
- @settings=@defaults.merge(user_settings)
18
- @escaped_decimal = Regexp.escape(@settings[:decimal_mark])
19
- end
20
-
21
- #cleans each number one by one
22
- def clean(dirty_numbers)
23
- return nil if dirty_numbers.nil?
24
- numbers=[]
25
- Array(dirty_numbers).each do |cell|
26
- numbers << cell_to_number(cell.to_s)
27
- end
28
-
29
- (numbers.size == 1) ? numbers[0] : numbers
30
- end
31
-
32
- def cell_to_number(num)
33
- return nil if num.nil?
34
- # Remove annotations
35
- # if there is something in parenthesis and a number elsewhere, nuke the parenthesis
36
- temp = num.gsub(/[\(\[\{].*[\)\}\]]/, '')
37
- num = temp if temp.match(/\d/)
38
-
39
- num.gsub!("est.", '')
40
-
41
- #check for exponents by searching for 'e' 'E' or variations of 'x 10' '*10' and 'X10^'
42
- is_exp = false
43
- expmultiplier = 1
44
- m = /(\s)*(E|e|[X|x|\*](\s)*10(\^)?)(\s)*/.match(num)
45
- #check if match is made, preceeded by a number/decimal, and succeeded by a digit or a plus/minus sign
46
- if !m.nil? and m.pre_match =~ /[0-9#{@escaped_decimal}]$/ and m.post_match =~ /^([\-+0-9])/
47
- is_exp = true
48
- num = m.pre_match
49
- expmultiplier = 10 ** /^[0-9\-+]*/.match(m.post_match)[0].to_i
50
- end
51
- is_million = (num =~ /million/i)
52
- is_billion = (num =~ /billion/i)
53
- is_negative = (num =~ /-[\d]/ or (!@settings[:ignore_brackets] and num =~ /\([\d]/))
54
-
55
- # watch out for two numbers, like a range eg "27.3 - 33.9"
56
- # how: if you a see a number followed by a non number char that is not the decimal marker, kill everything to the right of that
57
- num.gsub!(/(\d) (\d)/, '\1\2')
58
- if m = num.match(/-?\s*[,\d\.]+/)
59
- num = m[0]
60
- end
61
-
62
- # only keep #s and decimal mark
63
- num.gsub!(/[^0-9#{@escaped_decimal}]/, '')
64
- num.gsub!(/[^0-9]/, '.')
65
-
66
- return nil if num.nil? || num !~ /[\d]/
67
- return nil if num.end_with?(".")
68
- return nil if num.count(".") > 1
69
- cell = num.nil? ? 0.0 : Float("%.#{14}g" % num)
70
- cell *= 1e6 if is_million
71
- cell *= 1e9 if is_billion
72
- cell *= -1 if is_negative
73
- cell *= expmultiplier if is_exp
74
- cell
75
- end
76
-
77
- end
78
- end
79
- end
1
module Quandl
  module Babelfish

    # Responsible for number cleaning: turns annotated, human-formatted
    # numeric text ("1,234.5", "(5)", "3 million", "1.5e3") into Floats.
    #
    # All state lives in class-level instance variables, so a call to .init
    # affects every later call to .clean / .cell_to_number.
    class NumberMaid
      @defaults = {
        :decimal_mark => '.', # raw character; it is regex-escaped exactly once below
        :ignore_brackets => false, # Brackets ARE negative by default
      }

      @settings = @defaults # init with defaults
      # Set up front so cleaning works even when .init is never called.
      @escaped_decimal = Regexp.escape(@settings[:decimal_mark])

      class << self

        # Replaces the active settings with the defaults overlaid by
        # +user_settings+.
        #
        # @param user_settings [Hash] :decimal_mark (String) and/or
        #   :ignore_brackets (truthy to stop treating "(5)" as negative)
        def init(user_settings)
          @settings = @defaults.merge(user_settings)
          @escaped_decimal = Regexp.escape(@settings[:decimal_mark])
        end

        # Cleans each number one by one.
        #
        # @param dirty_numbers [Array, Object, nil] one cell or a list of cells
        # @return [Float, nil, Array<Float, nil>] nil for nil input; the bare
        #   value when exactly one cell was given; otherwise an array with one
        #   entry per cell (nil where no number could be extracted)
        def clean(dirty_numbers)
          return nil if dirty_numbers.nil?

          numbers = Array(dirty_numbers).map { |cell| cell_to_number(cell.to_s) }
          numbers.size == 1 ? numbers[0] : numbers
        end

        # Extracts a Float from one cell. The argument is never modified, so
        # frozen strings are accepted.
        #
        # @param num [String, #to_s, nil] raw cell content
        # @return [Float, nil] nil when +num+ is nil or holds no usable number
        def cell_to_number(num)
          return nil if num.nil?

          text = num.to_s

          # Remove annotations
          # if there is something in parenthesis and a number elsewhere, nuke the parenthesis
          without_brackets = text.gsub(/[\(\[\{].*[\)\}\]]/, '')
          text = without_brackets if without_brackets.match(/\d/)

          text = text.gsub("est.", '')

          # check for exponents by searching for 'e' 'E' or variations of 'x 10' '*10' and 'X10^'
          is_exp = false
          expmultiplier = 1
          m = /(\s)*(E|e|[Xx*](\s)*10(\^)?)(\s)*/.match(text)
          # check if match is made, preceded by a number/decimal, and succeeded by a digit or a plus/minus sign
          if m && m.pre_match =~ /[0-9#{@escaped_decimal}]$/ && m.post_match =~ /^([\-+0-9])/
            is_exp = true
            text = m.pre_match
            exponent = m.post_match[/^[0-9\-+]*/].to_i
            expmultiplier = 10 ** exponent
          end

          is_million = (text =~ /million/i)
          is_billion = (text =~ /billion/i)
          is_negative = (text =~ /-[\d]/ || (!@settings[:ignore_brackets] && text =~ /\([\d]/))

          # watch out for two numbers, like a range eg "27.3 - 33.9"
          # how: keep only the first run of digits, commas and dots
          text = text.gsub(/(\d) (\d)/, '\1\2')
          if (token = text.match(/-?\s*[,\d\.]+/))
            text = token[0]
          end

          # only keep #s and decimal mark, then normalise the mark to '.'
          text = text.gsub(/[^0-9#{@escaped_decimal}]/, '')
          text = text.gsub(/[^0-9]/, '.')

          return nil if text !~ /[\d]/
          return nil if text.end_with?(".")
          return nil if text.count(".") > 1

          cell = Float("%.14g" % text) # 14 significant digits
          cell *= 1e6 if is_million
          cell *= 1e9 if is_billion
          cell *= -1 if is_negative
          cell *= expmultiplier if is_exp
          cell
        end

      end
    end
  end
end
@@ -1,5 +1,5 @@
1
- module Quandl
2
- module Babelfish
3
- VERSION = '0.0.6'
4
- end
1
module Quandl
  module Babelfish
    # Gem version string; frozen so it cannot be modified in place.
    VERSION = '0.0.7'.freeze
  end
end