quandl_babelfish 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +7 -7
- data/.travis.yml +12 -12
- data/Gemfile +1 -1
- data/LICENSE +7 -7
- data/README.md +18 -18
- data/UPGRADE.md +31 -29
- data/lib/quandl/babelfish/chronometer.rb +43 -43
- data/lib/quandl/babelfish/cleaner.rb +32 -32
- data/lib/quandl/babelfish/date_maid.rb +237 -237
- data/lib/quandl/babelfish/helper.rb +9 -0
- data/lib/quandl/babelfish/number_maid.rb +79 -79
- data/lib/quandl/babelfish/version.rb +4 -4
- data/lib/quandl/babelfish.rb +28 -27
- data/lib/quandl/error/guess_date_format.rb +4 -4
- data/lib/quandl/error/invalid_date.rb +4 -4
- data/lib/quandl/error/standard.rb +26 -26
- data/lib/quandl/error/unknown_date_format.rb +4 -4
- data/quandl_babelfish.gemspec +21 -21
- data/spec/lib/quandl/babelfish/chronometer_spec.rb +50 -50
- data/spec/lib/quandl/babelfish/cleaner_spec.rb +70 -49
- data/spec/lib/quandl/babelfish/date_maid_spec.rb +528 -528
- data/spec/lib/quandl/babelfish/helper_spec.rb +45 -0
- data/spec/lib/quandl/babelfish/number_maid_spec.rb +126 -126
- data/spec/lib/quandl/babelfish_spec.rb +15 -15
- data/spec/spec_helper.rb +12 -12
- data/spec/support/matchers/be_eq_at_index.rb +31 -31
- metadata +18 -17
- checksums.yaml +0 -7
@@ -1,238 +1,238 @@
|
|
1
|
-
module Quandl
|
2
|
-
module Babelfish
|
3
|
-
|
4
|
-
#responsible for number formatting
|
5
|
-
class DateMaid
|
6
|
-
@defaults = {
|
7
|
-
:format => nil
|
8
|
-
}
|
9
|
-
|
10
|
-
@settings = @defaults #init with defaults
|
11
|
-
|
12
|
-
class << self
|
13
|
-
|
14
|
-
def init(user_settings)
|
15
|
-
@settings=@defaults.merge(user_settings)
|
16
|
-
end
|
17
|
-
|
18
|
-
#looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
|
19
|
-
def sweep(all_dates)
|
20
|
-
return nil if all_dates.nil?
|
21
|
-
|
22
|
-
all_dates = disinfect all_dates
|
23
|
-
|
24
|
-
if @settings[:format].nil?
|
25
|
-
#find good example and extract all info from it and apply it to each of the dates in the set
|
26
|
-
good_sample = find_good_date(all_dates)
|
27
|
-
|
28
|
-
raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?
|
29
|
-
|
30
|
-
date_format, frequency = analyze_date_format(good_sample)
|
31
|
-
|
32
|
-
|
33
|
-
else
|
34
|
-
date_format = @settings[:format]
|
35
|
-
end
|
36
|
-
|
37
|
-
iso_dates=[]
|
38
|
-
all_dates.each_with_index do |fuzzy_date, i|
|
39
|
-
temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
|
40
|
-
iso_dates << frequency_transform(temp_date, frequency)
|
41
|
-
end
|
42
|
-
|
43
|
-
iso_dates
|
44
|
-
end
|
45
|
-
|
46
|
-
def analyze_date_format(example)
|
47
|
-
return nil if example.nil?
|
48
|
-
|
49
|
-
# Regular formats and Custom formats (where Date.parse and Date.strptime
|
50
|
-
# fear to tread)
|
51
|
-
if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
|
52
|
-
if re[1].to_i > 12
|
53
|
-
return '%d-%m-%Y', nil
|
54
|
-
else
|
55
|
-
return '%m-%d-%Y', nil
|
56
|
-
end
|
57
|
-
end
|
58
|
-
if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
|
59
|
-
if re[1].to_i > 12
|
60
|
-
return '%d-%m-%y', nil
|
61
|
-
else
|
62
|
-
return '%m-%d-%y', nil
|
63
|
-
end
|
64
|
-
end
|
65
|
-
# order these guys from most specific to most general
|
66
|
-
return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
|
67
|
-
return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
|
68
|
-
return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
|
69
|
-
return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
|
70
|
-
return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
|
71
|
-
return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
|
72
|
-
return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
|
73
|
-
return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
|
74
|
-
return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
|
75
|
-
return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
|
76
|
-
return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
|
77
|
-
return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
|
78
|
-
return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
|
79
|
-
return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
|
80
|
-
return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
|
81
|
-
return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
|
82
|
-
return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
|
83
|
-
return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
|
84
|
-
return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
|
85
|
-
return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
|
86
|
-
return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
|
87
|
-
return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits
|
88
|
-
return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits
|
89
|
-
|
90
|
-
#our custom formats
|
91
|
-
return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
|
92
|
-
return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
|
93
|
-
return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel
|
94
|
-
|
95
|
-
# No, try default date parse
|
96
|
-
# raise PostProcessorException, "Unable to guess date format for #{example}"
|
97
|
-
[nil, nil]
|
98
|
-
end
|
99
|
-
|
100
|
-
def disinfect(dates)
|
101
|
-
[*dates].collect do |date|
|
102
|
-
date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
|
103
|
-
date.to_s.gsub!(/[^\x01-\x7f]/,'')
|
104
|
-
date.to_s.strip.gsub(/\s\s+/, ' ')
|
105
|
-
end
|
106
|
-
end
|
107
|
-
private
|
108
|
-
|
109
|
-
|
110
|
-
#converts date to specified format
|
111
|
-
def convert(fuzzy_date, date_format)
|
112
|
-
if date_format.nil?
|
113
|
-
# Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
|
114
|
-
tokens = fuzzy_date.split(/\D/)
|
115
|
-
if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
|
116
|
-
# Its ISO
|
117
|
-
return DateTime.parse(fuzzy_date.to_s).to_date
|
118
|
-
else
|
119
|
-
# Guessing US
|
120
|
-
return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
|
121
|
-
end
|
122
|
-
else
|
123
|
-
case date_format
|
124
|
-
when ':year_quarter'
|
125
|
-
return year_quarter_formatter(fuzzy_date)
|
126
|
-
when ':excel-1900'
|
127
|
-
return excel_1900_formatter(fuzzy_date)
|
128
|
-
else #regular ruby formatter
|
129
|
-
return regular_formatter(fuzzy_date, date_format)
|
130
|
-
end
|
131
|
-
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
|
136
|
-
def year_quarter_formatter(fuzzy_date)
|
137
|
-
raw_date = fuzzy_date
|
138
|
-
tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
|
139
|
-
tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
|
140
|
-
Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
|
141
|
-
end
|
142
|
-
|
143
|
-
def excel_1900_formatter(fuzzy_date)
|
144
|
-
# handle Lotus 123 bug has 1900 as a leap year
|
145
|
-
Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
|
146
|
-
end
|
147
|
-
|
148
|
-
def regular_formatter(fuzzy_date, date_format)
|
149
|
-
# We have a date format - oh so pretty, but...
|
150
|
-
date_string = fuzzy_date
|
151
|
-
# normalize delimiters to hyphens so we do not have to make a format for each one.
|
152
|
-
# delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
|
153
|
-
# only if no format where provided
|
154
|
-
date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?
|
155
|
-
|
156
|
-
#epoch date string
|
157
|
-
if date_format == 'epoch'
|
158
|
-
news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
|
159
|
-
formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
|
160
|
-
else
|
161
|
-
if date_string.to_s =~ /^(\w{3})\D(\d{2})$/
|
162
|
-
century = $2.to_i < 25 ? '20' : '19'
|
163
|
-
date_string = "#{$1} #{century}#{$2}"
|
164
|
-
formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
|
165
|
-
else
|
166
|
-
formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
|
167
|
-
end
|
168
|
-
end
|
169
|
-
formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
|
170
|
-
formatted_date
|
171
|
-
end
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
#find good example of date to use as template for format
|
176
|
-
def find_good_date(all_dates)
|
177
|
-
good_sample=nil
|
178
|
-
all_dates.each do |fuzzy_date|
|
179
|
-
if usable_cell(fuzzy_date)
|
180
|
-
good_sample = fuzzy_date
|
181
|
-
break
|
182
|
-
end
|
183
|
-
end
|
184
|
-
good_sample
|
185
|
-
end
|
186
|
-
|
187
|
-
def usable_cell(cell)
|
188
|
-
return false if cell.nil? || cell.to_s.empty?
|
189
|
-
return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20
|
190
|
-
|
191
|
-
return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
|
192
|
-
# date is not usable as an example if it is ambiguous as to day and month
|
193
|
-
# 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
|
194
|
-
if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
|
195
|
-
if re[1].to_i <= 12 and re[2].to_i <= 12
|
196
|
-
return false
|
197
|
-
else
|
198
|
-
return true
|
199
|
-
end
|
200
|
-
end
|
201
|
-
|
202
|
-
if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
|
203
|
-
if re[1].to_i <= 12 and re[2].to_i <= 12
|
204
|
-
return false
|
205
|
-
else
|
206
|
-
return true
|
207
|
-
end
|
208
|
-
end
|
209
|
-
|
210
|
-
return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere
|
211
|
-
|
212
|
-
return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous
|
213
|
-
|
214
|
-
false # Thank you, come again
|
215
|
-
end
|
216
|
-
|
217
|
-
# Bump date to the end of the respective periods
|
218
|
-
def frequency_transform(date, frequency)
|
219
|
-
case frequency
|
220
|
-
when 'annual'
|
221
|
-
date = Date.new(date.year,12,31)
|
222
|
-
when 'quarterly'
|
223
|
-
month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
|
224
|
-
date = Date.new(date.year, month, 1).next_month-1
|
225
|
-
when 'monthly'
|
226
|
-
date = Date.new(date.year, date.month,1).next_month-1
|
227
|
-
else
|
228
|
-
# Do nothing for daily or weekly
|
229
|
-
end
|
230
|
-
|
231
|
-
date
|
232
|
-
end
|
233
|
-
|
234
|
-
|
235
|
-
end
|
236
|
-
end
|
237
|
-
end
|
1
|
+
module Quandl
|
2
|
+
module Babelfish
|
3
|
+
|
4
|
+
#responsible for date formatting
|
5
|
+
class DateMaid
|
6
|
+
@defaults = {
|
7
|
+
:format => nil
|
8
|
+
}
|
9
|
+
|
10
|
+
@settings = @defaults #init with defaults
|
11
|
+
|
12
|
+
class << self
|
13
|
+
|
14
|
+
def init(user_settings)
|
15
|
+
@settings=@defaults.merge(user_settings)
|
16
|
+
end
|
17
|
+
|
18
|
+
#looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
|
19
|
+
def sweep(all_dates)
|
20
|
+
return nil if all_dates.nil?
|
21
|
+
|
22
|
+
all_dates = disinfect all_dates
|
23
|
+
|
24
|
+
if @settings[:format].nil?
|
25
|
+
#find good example and extract all info from it and apply it to each of the dates in the set
|
26
|
+
good_sample = find_good_date(all_dates)
|
27
|
+
|
28
|
+
raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?
|
29
|
+
|
30
|
+
date_format, frequency = analyze_date_format(good_sample)
|
31
|
+
|
32
|
+
|
33
|
+
else
|
34
|
+
date_format = @settings[:format]
|
35
|
+
end
|
36
|
+
|
37
|
+
iso_dates=[]
|
38
|
+
all_dates.each_with_index do |fuzzy_date, i|
|
39
|
+
temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
|
40
|
+
iso_dates << frequency_transform(temp_date, frequency)
|
41
|
+
end
|
42
|
+
|
43
|
+
iso_dates
|
44
|
+
end
|
45
|
+
|
46
|
+
def analyze_date_format(example)
|
47
|
+
return nil if example.nil?
|
48
|
+
|
49
|
+
# Regular formats and Custom formats (where Date.parse and Date.strptime
|
50
|
+
# fear to tread)
|
51
|
+
if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
|
52
|
+
if re[1].to_i > 12
|
53
|
+
return '%d-%m-%Y', nil
|
54
|
+
else
|
55
|
+
return '%m-%d-%Y', nil
|
56
|
+
end
|
57
|
+
end
|
58
|
+
if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
|
59
|
+
if re[1].to_i > 12
|
60
|
+
return '%d-%m-%y', nil
|
61
|
+
else
|
62
|
+
return '%m-%d-%y', nil
|
63
|
+
end
|
64
|
+
end
|
65
|
+
# order these guys from most specific to most general
|
66
|
+
return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
|
67
|
+
return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
|
68
|
+
return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
|
69
|
+
return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
|
70
|
+
return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
|
71
|
+
return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
|
72
|
+
return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
|
73
|
+
return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
|
74
|
+
return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
|
75
|
+
return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
|
76
|
+
return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
|
77
|
+
return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
|
78
|
+
return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
|
79
|
+
return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
|
80
|
+
return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
|
81
|
+
return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
|
82
|
+
return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
|
83
|
+
return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
|
84
|
+
return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
|
85
|
+
return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
|
86
|
+
return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
|
87
|
+
return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits
|
88
|
+
return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits
|
89
|
+
|
90
|
+
#our custom formats
|
91
|
+
return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
|
92
|
+
return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
|
93
|
+
return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel
|
94
|
+
|
95
|
+
# No, try default date parse
|
96
|
+
# raise PostProcessorException, "Unable to guess date format for #{example}"
|
97
|
+
[nil, nil]
|
98
|
+
end
|
99
|
+
|
100
|
+
def disinfect(dates)
|
101
|
+
[*dates].collect do |date|
|
102
|
+
date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
|
103
|
+
date.to_s.gsub!(/[^\x01-\x7f]/,'')
|
104
|
+
date.to_s.strip.gsub(/\s\s+/, ' ')
|
105
|
+
end
|
106
|
+
end
|
107
|
+
private
|
108
|
+
|
109
|
+
|
110
|
+
#converts date to specified format
|
111
|
+
def convert(fuzzy_date, date_format)
|
112
|
+
if date_format.nil?
|
113
|
+
# Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
|
114
|
+
tokens = fuzzy_date.split(/\D/)
|
115
|
+
if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
|
116
|
+
# It's ISO
|
117
|
+
return DateTime.parse(fuzzy_date.to_s).to_date
|
118
|
+
else
|
119
|
+
# Guessing US
|
120
|
+
return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
|
121
|
+
end
|
122
|
+
else
|
123
|
+
case date_format
|
124
|
+
when ':year_quarter'
|
125
|
+
return year_quarter_formatter(fuzzy_date)
|
126
|
+
when ':excel-1900'
|
127
|
+
return excel_1900_formatter(fuzzy_date)
|
128
|
+
else #regular ruby formatter
|
129
|
+
return regular_formatter(fuzzy_date, date_format)
|
130
|
+
end
|
131
|
+
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
|
136
|
+
def year_quarter_formatter(fuzzy_date)
|
137
|
+
raw_date = fuzzy_date
|
138
|
+
tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
|
139
|
+
tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
|
140
|
+
Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
|
141
|
+
end
|
142
|
+
|
143
|
+
def excel_1900_formatter(fuzzy_date)
|
144
|
+
# handle the Lotus 1-2-3 bug that treats 1900 as a leap year
|
145
|
+
Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
|
146
|
+
end
|
147
|
+
|
148
|
+
def regular_formatter(fuzzy_date, date_format)
|
149
|
+
# We have a date format - oh so pretty, but...
|
150
|
+
date_string = fuzzy_date
|
151
|
+
# normalize delimiters to hyphens so we do not have to make a format for each one.
|
152
|
+
# delimiters can be letters when it's all numbers, and delimiters only when there are letters. Sigh.
|
153
|
+
# only if no format was provided
|
154
|
+
date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?
|
155
|
+
|
156
|
+
#epoch date string
|
157
|
+
if date_format == 'epoch'
|
158
|
+
news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
|
159
|
+
formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
|
160
|
+
else
|
161
|
+
if date_string.to_s =~ /^(\w{3})\D(\d{2})$/
|
162
|
+
century = $2.to_i < 25 ? '20' : '19'
|
163
|
+
date_string = "#{$1} #{century}#{$2}"
|
164
|
+
formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
|
165
|
+
else
|
166
|
+
formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
|
167
|
+
end
|
168
|
+
end
|
169
|
+
formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
|
170
|
+
formatted_date
|
171
|
+
end
|
172
|
+
|
173
|
+
|
174
|
+
|
175
|
+
#find good example of date to use as template for format
|
176
|
+
def find_good_date(all_dates)
|
177
|
+
good_sample=nil
|
178
|
+
all_dates.each do |fuzzy_date|
|
179
|
+
if usable_cell(fuzzy_date)
|
180
|
+
good_sample = fuzzy_date
|
181
|
+
break
|
182
|
+
end
|
183
|
+
end
|
184
|
+
good_sample
|
185
|
+
end
|
186
|
+
|
187
|
+
def usable_cell(cell)
|
188
|
+
return false if cell.nil? || cell.to_s.empty?
|
189
|
+
return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20
|
190
|
+
|
191
|
+
return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
|
192
|
+
# date is not usable as an example if it is ambiguous as to day and month
|
193
|
+
# 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
|
194
|
+
if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
|
195
|
+
if re[1].to_i <= 12 and re[2].to_i <= 12
|
196
|
+
return false
|
197
|
+
else
|
198
|
+
return true
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
|
203
|
+
if re[1].to_i <= 12 and re[2].to_i <= 12
|
204
|
+
return false
|
205
|
+
else
|
206
|
+
return true
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere
|
211
|
+
|
212
|
+
return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous
|
213
|
+
|
214
|
+
false # Thank you, come again
|
215
|
+
end
|
216
|
+
|
217
|
+
# Bump date to the end of the respective periods
|
218
|
+
def frequency_transform(date, frequency)
|
219
|
+
case frequency
|
220
|
+
when 'annual'
|
221
|
+
date = Date.new(date.year,12,31)
|
222
|
+
when 'quarterly'
|
223
|
+
month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
|
224
|
+
date = Date.new(date.year, month, 1).next_month-1
|
225
|
+
when 'monthly'
|
226
|
+
date = Date.new(date.year, date.month,1).next_month-1
|
227
|
+
else
|
228
|
+
# Do nothing for daily or weekly
|
229
|
+
end
|
230
|
+
|
231
|
+
date
|
232
|
+
end
|
233
|
+
|
234
|
+
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
238
238
|
end
|
@@ -1,80 +1,80 @@
|
|
1
|
-
module Quandl
|
2
|
-
module Babelfish
|
3
|
-
|
4
|
-
#responsible for number cleaning
|
5
|
-
class NumberMaid
|
6
|
-
@defaults = {
|
7
|
-
:decimal_mark => Regexp.escape('.'),
|
8
|
-
:ignore_brackets => false, # Brackets ARE negative by default
|
9
|
-
}
|
10
|
-
|
11
|
-
@settings = @defaults #init with defaults
|
12
|
-
|
13
|
-
class << self
|
14
|
-
|
15
|
-
|
16
|
-
def init(user_settings)
|
17
|
-
@settings=@defaults.merge(user_settings)
|
18
|
-
@escaped_decimal = Regexp.escape(@settings[:decimal_mark])
|
19
|
-
end
|
20
|
-
|
21
|
-
#cleans each number one by one
|
22
|
-
def clean(dirty_numbers)
|
23
|
-
return nil if dirty_numbers.nil?
|
24
|
-
numbers=[]
|
25
|
-
Array(dirty_numbers).each do |cell|
|
26
|
-
numbers << cell_to_number(cell.to_s)
|
27
|
-
end
|
28
|
-
|
29
|
-
(numbers.size == 1) ? numbers[0] : numbers
|
30
|
-
end
|
31
|
-
|
32
|
-
def cell_to_number(num)
|
33
|
-
return nil if num.nil?
|
34
|
-
# Remove annotations
|
35
|
-
# if there is something in parenthesis and a number elsewhere, nuke the parenthesis
|
36
|
-
temp = num.gsub(/[\(\[\{].*[\)\}\]]/, '')
|
37
|
-
num = temp if temp.match(/\d/)
|
38
|
-
|
39
|
-
num.gsub!("est.", '')
|
40
|
-
|
41
|
-
#check for exponents by searching for 'e' 'E' or variations of 'x 10' '*10' and 'X10^'
|
42
|
-
is_exp = false
|
43
|
-
expmultiplier = 1
|
44
|
-
m = /(\s)*(E|e|[X|x|\*](\s)*10(\^)?)(\s)*/.match(num)
|
45
|
-
#check if match is made, preceeded by a number/decimal, and succeeded by a digit or a plus/minus sign
|
46
|
-
if !m.nil? and m.pre_match =~ /[0-9#{@escaped_decimal}]$/ and m.post_match =~ /^([\-+0-9])/
|
47
|
-
is_exp = true
|
48
|
-
num = m.pre_match
|
49
|
-
expmultiplier = 10 ** /^[0-9\-+]*/.match(m.post_match)[0].to_i
|
50
|
-
end
|
51
|
-
is_million = (num =~ /million/i)
|
52
|
-
is_billion = (num =~ /billion/i)
|
53
|
-
is_negative = (num =~ /-[\d]/ or (!@settings[:ignore_brackets] and num =~ /\([\d]/))
|
54
|
-
|
55
|
-
# watch out for two numbers, like a range eg "27.3 - 33.9"
|
56
|
-
# how: if you a see a number followed by a non number char that is not the decimal marker, kill everything to the right of that
|
57
|
-
num.gsub!(/(\d) (\d)/, '\1\2')
|
58
|
-
if m = num.match(/-?\s*[,\d\.]+/)
|
59
|
-
num = m[0]
|
60
|
-
end
|
61
|
-
|
62
|
-
# only keep #s and decimal mark
|
63
|
-
num.gsub!(/[^0-9#{@escaped_decimal}]/, '')
|
64
|
-
num.gsub!(/[^0-9]/, '.')
|
65
|
-
|
66
|
-
return nil if num.nil? || num !~ /[\d]/
|
67
|
-
return nil if num.end_with?(".")
|
68
|
-
return nil if num.count(".") > 1
|
69
|
-
cell = num.nil? ? 0.0 : Float("%.#{14}g" % num)
|
70
|
-
cell *= 1e6 if is_million
|
71
|
-
cell *= 1e9 if is_billion
|
72
|
-
cell *= -1 if is_negative
|
73
|
-
cell *= expmultiplier if is_exp
|
74
|
-
cell
|
75
|
-
end
|
76
|
-
|
77
|
-
end
|
78
|
-
end
|
79
|
-
end
|
1
|
+
module Quandl
|
2
|
+
module Babelfish
|
3
|
+
|
4
|
+
#responsible for number cleaning
|
5
|
+
class NumberMaid
|
6
|
+
@defaults = {
|
7
|
+
:decimal_mark => Regexp.escape('.'),
|
8
|
+
:ignore_brackets => false, # Brackets ARE negative by default
|
9
|
+
}
|
10
|
+
|
11
|
+
@settings = @defaults #init with defaults
|
12
|
+
|
13
|
+
class << self
|
14
|
+
|
15
|
+
|
16
|
+
def init(user_settings)
|
17
|
+
@settings=@defaults.merge(user_settings)
|
18
|
+
@escaped_decimal = Regexp.escape(@settings[:decimal_mark])
|
19
|
+
end
|
20
|
+
|
21
|
+
#cleans each number one by one
|
22
|
+
def clean(dirty_numbers)
|
23
|
+
return nil if dirty_numbers.nil?
|
24
|
+
numbers=[]
|
25
|
+
Array(dirty_numbers).each do |cell|
|
26
|
+
numbers << cell_to_number(cell.to_s)
|
27
|
+
end
|
28
|
+
|
29
|
+
(numbers.size == 1) ? numbers[0] : numbers
|
30
|
+
end
|
31
|
+
|
32
|
+
def cell_to_number(num)
|
33
|
+
return nil if num.nil?
|
34
|
+
# Remove annotations
|
35
|
+
# if there is something in parentheses and a number elsewhere, nuke the parentheses
|
36
|
+
temp = num.gsub(/[\(\[\{].*[\)\}\]]/, '')
|
37
|
+
num = temp if temp.match(/\d/)
|
38
|
+
|
39
|
+
num.gsub!("est.", '')
|
40
|
+
|
41
|
+
#check for exponents by searching for 'e' 'E' or variations of 'x 10' '*10' and 'X10^'
|
42
|
+
is_exp = false
|
43
|
+
expmultiplier = 1
|
44
|
+
m = /(\s)*(E|e|[X|x|\*](\s)*10(\^)?)(\s)*/.match(num)
|
45
|
+
#check if match is made, preceded by a number/decimal, and followed by a digit or a plus/minus sign
|
46
|
+
if !m.nil? and m.pre_match =~ /[0-9#{@escaped_decimal}]$/ and m.post_match =~ /^([\-+0-9])/
|
47
|
+
is_exp = true
|
48
|
+
num = m.pre_match
|
49
|
+
expmultiplier = 10 ** /^[0-9\-+]*/.match(m.post_match)[0].to_i
|
50
|
+
end
|
51
|
+
is_million = (num =~ /million/i)
|
52
|
+
is_billion = (num =~ /billion/i)
|
53
|
+
is_negative = (num =~ /-[\d]/ or (!@settings[:ignore_brackets] and num =~ /\([\d]/))
|
54
|
+
|
55
|
+
# watch out for two numbers, like a range eg "27.3 - 33.9"
|
56
|
+
# how: if you see a number followed by a non-number char that is not the decimal marker, kill everything to the right of that
|
57
|
+
num.gsub!(/(\d) (\d)/, '\1\2')
|
58
|
+
if m = num.match(/-?\s*[,\d\.]+/)
|
59
|
+
num = m[0]
|
60
|
+
end
|
61
|
+
|
62
|
+
# only keep #s and decimal mark
|
63
|
+
num.gsub!(/[^0-9#{@escaped_decimal}]/, '')
|
64
|
+
num.gsub!(/[^0-9]/, '.')
|
65
|
+
|
66
|
+
return nil if num.nil? || num !~ /[\d]/
|
67
|
+
return nil if num.end_with?(".")
|
68
|
+
return nil if num.count(".") > 1
|
69
|
+
cell = num.nil? ? 0.0 : Float("%.#{14}g" % num)
|
70
|
+
cell *= 1e6 if is_million
|
71
|
+
cell *= 1e9 if is_billion
|
72
|
+
cell *= -1 if is_negative
|
73
|
+
cell *= expmultiplier if is_exp
|
74
|
+
cell
|
75
|
+
end
|
76
|
+
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
80
|
end
|
@@ -1,5 +1,5 @@
|
|
1
|
-
module Quandl
|
2
|
-
module Babelfish
|
3
|
-
VERSION = '0.0.6'
|
4
|
-
end
|
1
|
+
module Quandl
|
2
|
+
module Babelfish
|
3
|
+
VERSION = '0.0.7'
|
4
|
+
end
|
5
5
|
end
|