quandl_babelfish 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,238 +1,238 @@
1
module Quandl
  module Babelfish

    # Responsible for date formatting: turns loosely formatted date cells
    # into Date objects, guessing the input format when none is configured.
    class DateMaid
      @defaults = {
        :format => nil # nil means "guess the format from the data"
      }

      @settings = @defaults # start from the defaults until init is called

      # Ordered [pattern, format, frequency] rules used by analyze_date_format.
      # The first matching rule wins, so the order runs from most specific to
      # most general. Formats starting with ':' are custom formats handled by
      # convert; 'epoch' is handled by regular_formatter; everything else is a
      # strptime format.
      FORMAT_RULES = [
        [/^\d{7}.*$/,                    'epoch',         'daily'],     # 7 or more digits - epoch
        [/^\d{4}$/,                      '%Y',            'annual'],    # 4 digits
        [/^\d{4}\.0$/,                   '%Y',            'annual'],    # 4 digits with a dot 0 for excel
        [/^\d{4}[Qq]\d$/,                ':year_quarter', 'quarterly'], # 4 digits, Q, digit (must precede the generic patterns below)
        [/^\d{4}M\d{1,2}$/,              '%YM%m',         'monthly'],   # 2007M08
        [/^\d{4}W\d{1,2}$/,              '%GW%V',         'weekly'],    # 2012W01
        [/^\d{4}\D\d{1,2}$/,             '%Y-%m',         'monthly'],   # 4 digits, separator, 1-2 digits
        [/^\d{1,2}\D\d{4}$/,             '%m-%Y',         'monthly'],   # 1-2 digits, separator, 4 digits
        [/^\d{6}$/,                      '%Y%m',          'monthly'],   # 6 digits
        [/^\d{4}\D\w{3}$/,               '%Y-%b',         'monthly'],   # 4 digits, separator, 3 letters
        [/^\w{3}\D\d{4}$/,               '%b-%Y',         'monthly'],   # 3 letters, separator, 4 digits
        [/^\w{3}\D\d{2}$/,               '%b-%y',         'monthly'],   # 3 letters, separator, 2 digits
        [/^\d{4}\w{3}$/,                 '%Y%b',          'monthly'],   # 4 digits, 3 letters
        [/^\w{3}\d{4}$/,                 '%b%Y',          'monthly'],   # 3 letters, 4 digits
        [/^\d{4}\D\w{3}\D\d{1,2}$/,      '%Y-%b-%d',      'daily'],     # 4 digits, separator, 3 letters, separator, 1-2 digits
        [/^\d{4}\D\d{1,2}\D\d{1,2}$/,    '%Y-%m-%d',      'daily'],     # 4 digits, separator, 1-2 digits, separator, 1-2 digits
        [/^\d{1,2}\D\w{3}\D\d{4}$/,      '%d-%b-%Y',      'daily'],     # 1-2 digits, separator, 3 letters, separator, 4 digits
        [/^\d{4}\w{3}\d{1,2}$/,          '%Y%b%d',        'daily'],     # 4 digits, 3 letters, 1-2 digits
        [/^\d{1,2}\w{3}\d{4}$/,          '%d%b%Y',        'daily'],     # 1-2 digits, 3 letters, 4 digits
        [/^\d{1,2}\D\w{3}\D\d{2}$/,      '%d-%b-%y',      'daily'],     # 1-2 digits, separator, 3 letters, separator, 2 digits
        [/^\w{3}\D\d{1,2}\D{1,2}\d{4}$/, '%b-%d-%Y',      'daily'],     # 3 letters, separator, 1-2 digits, separator(s), 4 digits
        # our custom formats
        [/^\d{4}\D[Qq]\d$/,              ':year_quarter', 'quarterly'], # 4 digits, separator, Q, digit
        [/^\d{5}$/,                      ':excel-1900',   'daily'],     # 5 digits
        [/^\d{5}\.0$/,                   ':excel-1900',   'daily']      # 5 digits dot zero excel
      ].freeze

      class << self

        # Replaces the active settings with the defaults merged with
        # user_settings (a Hash; nil is treated as "no overrides").
        # The only key read by this class is :format.
        def init(user_settings)
          @settings = @defaults.merge(user_settings || {})
        end

        # Looks at all the dates and converts each one to a Date.
        #
        # all_dates - a single value or a list of date-like values.
        #
        # Returns nil when all_dates is nil, otherwise an Array with one entry
        # per input. When no :format is configured, the format and frequency
        # are guessed from the first unambiguous sample and every date is
        # bumped to the end of its period (see frequency_transform).
        #
        # Raises Error::GuessDateFormat when no usable sample is found and
        # Error::InvalidDate when a cell cannot be converted.
        def sweep(all_dates)
          return nil if all_dates.nil?

          all_dates = disinfect(all_dates)
          frequency = nil # stays nil when an explicit format is configured

          if @settings[:format].nil?
            # find a good example, extract its format and apply it to every date in the set
            good_sample = find_good_date(all_dates)
            raise(Error::GuessDateFormat.new, "Unable to find date format for provided dates") if good_sample.nil?

            date_format, frequency = analyze_date_format(good_sample)
          else
            date_format = @settings[:format]
          end

          all_dates.each_with_index.map do |fuzzy_date, i|
            begin
              temp_date = convert(fuzzy_date, date_format)
            rescue StandardError
              # Any conversion failure is reported with the 1-based position of the cell.
              raise(Error::InvalidDate.new(line: i + 1, row: fuzzy_date, context: 'convert'), "Invalid date '#{fuzzy_date}'")
            end
            frequency_transform(temp_date, frequency)
          end
        end

        # Guesses the format of a sample date string.
        #
        # Returns nil when example is nil, otherwise a two element Array
        # [format, frequency]. Either element may be nil: frequency is nil for
        # day/month/year dates with separators, and both are nil when nothing
        # matches (convert then falls back to generic parsing).
        def analyze_date_format(example)
          return nil if example.nil?

          example = example.to_s

          # Regular formats and custom formats (where Date.parse and
          # Date.strptime fear to tread). A first field above 12 cannot be a
          # month, so the date is read as day first; otherwise month first.
          if (re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/)) # eg "07/03/2012"
            return (re[1].to_i > 12 ? '%d-%m-%Y' : '%m-%d-%Y'), nil
          end
          if (re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/)) # eg "07/03/12"
            return (re[1].to_i > 12 ? '%d-%m-%y' : '%m-%d-%y'), nil
          end

          # year range such as "2010-2011"
          return '%Y', 'annual' if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
          # precisely 8 digits with a plausible month and day - yyyymmdd
          return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32

          FORMAT_RULES.each do |pattern, date_format, frequency|
            return date_format, frequency if example =~ pattern
          end

          # Nothing matched: let convert try the default date parse
          [nil, nil]
        end

        # Cleans raw cells before parsing: converts each to a String, strips
        # characters outside \x01-\x7f, trims the ends and collapses runs of
        # two or more whitespace characters into one space.
        #
        # Accepts a single value or a list and always returns a new Array of
        # new Strings; the caller's objects are never modified.
        def disinfect(dates)
          [*dates].collect do |date|
            date.to_s
                .encode('UTF-8', 'UTF-8', :invalid => :replace)
                .gsub(/[^\x01-\x7f]/, '')
                .strip
                .gsub(/\s\s+/, ' ')
          end
        end

        private

        # Converts one cleaned date string using date_format.
        # A nil format means nothing could be guessed.
        def convert(fuzzy_date, date_format)
          case date_format
          when nil
            # Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
            tokens = fuzzy_date.split(/\D/)
            if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
              # Treat as ISO-like and let the generic parser handle it
              DateTime.parse(fuzzy_date.to_s).to_date
            else
              # Guessing US: month, day, year
              Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
            end
          when ':year_quarter'
            year_quarter_formatter(fuzzy_date)
          when ':excel-1900'
            excel_1900_formatter(fuzzy_date)
          else # regular ruby formatter
            regular_formatter(fuzzy_date, date_format)
          end
        end

        # Parses year/quarter strings such as "2012Q2" or "2012-Q2" into the
        # first day of the quarter's last month.
        def year_quarter_formatter(fuzzy_date)
          tokens = fuzzy_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
          tokens = tokens.reject(&:empty?) # replacing the Q can leave two delimiters in a row
          Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
        end

        # Converts an Excel (1900 date system) serial number to a Date.
        # Returns nil when the value is not a positive number.
        def excel_1900_formatter(fuzzy_date)
          serial = fuzzy_date.to_i
          return nil unless serial > 0

          # The extra "- 1" handles the Lotus 123 bug that has 1900 as a leap year
          Date.civil(1899, 12, 31) + serial - 1
        end

        # Parses fuzzy_date with a strptime format, or as seconds since the
        # Unix epoch when date_format is 'epoch'.
        def regular_formatter(fuzzy_date, date_format)
          date_string = fuzzy_date
          # Normalize delimiters to hyphens so we do not need a format per
          # delimiter. Only done when the format was guessed; a user supplied
          # format is matched against the text as is.
          date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?

          if date_format == 'epoch'
            utc = Time.at(date_string.to_i).utc
            formatted_date = Date.new(utc.year, utc.month, utc.day)
          elsif date_string.to_s =~ /^(\w{3})\D(\d{2})$/
            # "Jan-12" style: expand the two digit year ourselves.
            # Years below 25 are read as 20xx, all others as 19xx.
            month_name = $1
            short_year = $2
            century = short_year.to_i < 25 ? '20' : '19'
            formatted_date = DateTime.strptime("#{month_name} #{century}#{short_year}", '%b %Y').to_date
          else
            formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
          end

          formatted_date += 4 if date_format == '%GW%V' # strptime makes dates on Mondays. We want Fridays.
          formatted_date
        end

        # Finds the first date that is good enough to use as a template for
        # the format; returns nil when there is none.
        def find_good_date(all_dates)
          all_dates.find { |fuzzy_date| usable_cell(fuzzy_date) }
        end

        # Tells whether a cell can serve as the format sample.
        def usable_cell(cell)
          text = cell.to_s
          return false if cell.nil? || text.empty?
          return false if text.size > 20 # even annotated date can't be bigger than 20

          return true if text =~ /^\w{3}\D[456789]\d$/

          # A date is not usable as an example if it is ambiguous as to day and
          # month: 03/04/2012 is ambiguous, 03/17/2012 is NOT ambiguous.
          if (re = text.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/)) # e.g. 03/04/2012
            return !(re[1].to_i <= 12 && re[2].to_i <= 12)
          end

          if (re = text.match(/^(\d{1,2})\D\w{3}\D(\d{2})/)) # 07-jun-07
            return !(re[1].to_i <= 12 && re[2].to_i <= 12)
          end

          return true if text =~ /\d{4}/ # It has a 4 digit year somewhere

          return true if text =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous

          false
        end

        # Bumps date to the end of its period for annual, quarterly and
        # monthly data; any other frequency returns the date unchanged.
        # A nil date (e.g. an unusable Excel serial) is passed through.
        def frequency_transform(date, frequency)
          return nil if date.nil?

          case frequency
          when 'annual'
            Date.new(date.year, 12, 31)
          when 'quarterly'
            month = 3 * ((date.month - 1) / 3 + 1) # equals 3, 6, 9 or 12
            Date.new(date.year, month, 1).next_month - 1
          when 'monthly'
            Date.new(date.year, date.month, 1).next_month - 1
          else
            date # nothing to do for daily or weekly
          end
        end

      end
    end
  end
end
@@ -0,0 +1,9 @@
1
# Small table utilities.
class Helper

  # Actions expect a square table, make it so.
  #
  # table - an Array of rows (Arrays); a nil row is treated as an empty row.
  #
  # Returns a new Array of new rows, each padded on the right with nil up to
  # the length of the longest row. The input rows are not modified. An empty
  # table yields an empty Array.
  def self.make_square(table)
    rows = table.map { |row| row || [] }
    longest_row = rows.map(&:length).max.to_i # max is nil for an empty table
    rows.map { |row| row + Array.new(longest_row - row.length, nil) }
  end
end
@@ -1,80 +1,80 @@
1
module Quandl
  module Babelfish

    # Responsible for number cleaning: extracts a Float from free-form cells
    # such as "1,234.5", "(3)", "2 million" or "1.5e3".
    class NumberMaid
      @defaults = {
        :decimal_mark    => '.',   # stored raw; it is escaped once, when applied
        :ignore_brackets => false  # Brackets ARE negative by default
      }

      @settings = @defaults # start from the defaults until init is called
      # Keep the escaped mark in step with @settings so the defaults work even
      # when init is never called.
      @escaped_decimal = Regexp.escape(@defaults[:decimal_mark])

      class << self

        # Replaces the active settings with the defaults merged with
        # user_settings (a Hash; nil is treated as "no overrides").
        # Keys read by this class: :decimal_mark and :ignore_brackets.
        def init(user_settings)
          @settings = @defaults.merge(user_settings || {})
          @escaped_decimal = Regexp.escape(@settings[:decimal_mark])
        end

        # Cleans each number one by one.
        #
        # dirty_numbers - a single cell or a list of cells.
        #
        # Returns nil for nil input, the single cleaned value when exactly one
        # cell was given, otherwise an Array of cleaned values. Each cleaned
        # value is a Float, or nil when the cell holds no usable number.
        def clean(dirty_numbers)
          return nil if dirty_numbers.nil?

          numbers = Array(dirty_numbers).map { |cell| cell_to_number(cell.to_s) }
          numbers.size == 1 ? numbers[0] : numbers
        end

        # Converts one cell to a Float, or nil when no number can be read.
        # The argument is never modified.
        def cell_to_number(num)
          return nil if num.nil?

          num = num.to_s

          # Remove annotations: if there is something in brackets and a number
          # elsewhere, nuke the bracketed part.
          without_brackets = num.gsub(/[\(\[\{].*[\)\}\]]/, '')
          num = without_brackets if without_brackets.match(/\d/)

          num = num.gsub("est.", '')

          # Check for exponents by searching for 'e', 'E' or variations of
          # 'x 10', '*10' and 'X10^'.
          is_exp = false
          expmultiplier = 1
          m = /(\s)*(E|e|[Xx\*](\s)*10(\^)?)(\s)*/.match(num)
          # Only an exponent when preceded by a digit/decimal mark and followed
          # by a digit or a plus/minus sign.
          if !m.nil? && m.pre_match =~ /[0-9#{@escaped_decimal}]$/ && m.post_match =~ /^([\-+0-9])/
            is_exp = true
            num = m.pre_match
            expmultiplier = 10 ** m.post_match[/^[0-9\-+]*/].to_i
          end

          is_million = (num =~ /million/i)
          is_billion = (num =~ /billion/i)
          is_negative = (num =~ /-[\d]/ || (!@settings[:ignore_brackets] && num =~ /\([\d]/))

          # Watch out for two numbers, like a range eg "27.3 - 33.9": keep only
          # the first run of digits, commas and dots (with an optional sign).
          num = num.gsub(/(\d) (\d)/, '\1\2') # "1 234" is one number
          if (m = num.match(/-?\s*[,\d\.]+/))
            num = m[0]
          end

          # Only keep digits and the decimal mark, then normalise the mark to '.'
          num = num.gsub(/[^0-9#{@escaped_decimal}]/, '')
          num = num.gsub(/[^0-9]/, '.')

          return nil if num !~ /[\d]/
          return nil if num.end_with?(".")
          return nil if num.count(".") > 1

          cell = Float("%.14g" % num) # limit to 14 significant digits
          cell *= 1e6 if is_million
          cell *= 1e9 if is_billion
          cell *= -1 if is_negative
          cell *= expmultiplier if is_exp
          cell
        end

      end
    end
  end
end
@@ -1,5 +1,5 @@
1
module Quandl
  module Babelfish
    # Gem version string; frozen so the constant cannot be mutated in place.
    VERSION = '0.0.7'.freeze
  end
end