quandl_babelfish 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +7 -7
- data/.travis.yml +12 -12
- data/Gemfile +1 -1
- data/LICENSE +7 -7
- data/README.md +18 -18
- data/UPGRADE.md +38 -31
- data/lib/quandl/babelfish.rb +28 -28
- data/lib/quandl/babelfish/chronometer.rb +43 -43
- data/lib/quandl/babelfish/cleaner.rb +33 -32
- data/lib/quandl/babelfish/date_maid.rb +237 -237
- data/lib/quandl/babelfish/helper.rb +8 -8
- data/lib/quandl/babelfish/number_maid.rb +79 -79
- data/lib/quandl/babelfish/version.rb +4 -4
- data/lib/quandl/error/guess_date_format.rb +4 -4
- data/lib/quandl/error/invalid_date.rb +4 -4
- data/lib/quandl/error/standard.rb +26 -26
- data/lib/quandl/error/unknown_date_format.rb +4 -4
- data/quandl_babelfish.gemspec +21 -21
- data/spec/lib/quandl/babelfish/chronometer_spec.rb +50 -50
- data/spec/lib/quandl/babelfish/cleaner_spec.rb +70 -70
- data/spec/lib/quandl/babelfish/date_maid_spec.rb +528 -528
- data/spec/lib/quandl/babelfish/helper_spec.rb +44 -44
- data/spec/lib/quandl/babelfish/number_maid_spec.rb +126 -126
- data/spec/lib/quandl/babelfish_spec.rb +15 -15
- data/spec/spec_helper.rb +12 -12
- data/spec/support/matchers/be_eq_at_index.rb +31 -31
- metadata +12 -4
data/.gitignore
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
.idea/
|
2
|
-
/Gemfile.lock
|
3
|
-
/pkg
|
4
|
-
/tmp
|
5
|
-
.rvmrc
|
6
|
-
*.gem
|
7
|
-
*.log
|
1
|
+
.idea/
|
2
|
+
/Gemfile.lock
|
3
|
+
/pkg
|
4
|
+
/tmp
|
5
|
+
.rvmrc
|
6
|
+
*.gem
|
7
|
+
*.log
|
data/.travis.yml
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
-
language: ruby
|
2
|
-
|
3
|
-
rvm:
|
4
|
-
- 2.0.0
|
5
|
-
- 1.9.3
|
6
|
-
|
7
|
-
gemfile:
|
8
|
-
- Gemfile
|
9
|
-
|
10
|
-
matrix:
|
11
|
-
|
12
|
-
script: bundle exec rspec
|
1
|
+
language: ruby
|
2
|
+
|
3
|
+
rvm:
|
4
|
+
- 2.0.0
|
5
|
+
- 1.9.3
|
6
|
+
|
7
|
+
gemfile:
|
8
|
+
- Gemfile
|
9
|
+
|
10
|
+
matrix:
|
11
|
+
|
12
|
+
script: bundle exec rspec
|
data/Gemfile
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
source "https://rubygems.org"
|
1
|
+
source "https://rubygems.org"
|
2
2
|
gemspec
|
data/LICENSE
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
Copyright (c) 2012-2013 Quandl
|
2
|
-
|
3
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
4
|
-
|
5
|
-
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
6
|
-
|
7
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
1
|
+
Copyright (c) 2012-2013 Quandl
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
4
|
+
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
6
|
+
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,18 +1,18 @@
|
|
1
|
-
# Quandl::Babelfish
|
2
|
-
|
3
|
-
### Installation
|
4
|
-
|
5
|
-
```ruby
|
6
|
-
|
7
|
-
|
8
|
-
gem 'quandl_babelfish'
|
9
|
-
|
10
|
-
```
|
11
|
-
|
12
|
-
|
13
|
-
### Usage
|
14
|
-
|
15
|
-
```ruby
|
16
|
-
|
17
|
-
|
18
|
-
```
|
1
|
+
# Quandl::Babelfish
|
2
|
+
|
3
|
+
### Installation
|
4
|
+
|
5
|
+
```ruby
|
6
|
+
|
7
|
+
|
8
|
+
gem 'quandl_babelfish'
|
9
|
+
|
10
|
+
```
|
11
|
+
|
12
|
+
|
13
|
+
### Usage
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
|
17
|
+
|
18
|
+
```
|
data/UPGRADE.md
CHANGED
@@ -1,32 +1,39 @@
|
|
1
|
-
## 0.0.
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
*
|
25
|
-
|
26
|
-
|
27
|
-
## 0.0.
|
28
|
-
|
29
|
-
*
|
30
|
-
*
|
31
|
-
*
|
1
|
+
## 0.0.8
|
2
|
+
|
3
|
+
* QUGC-42 you should not have to include data; you might just want to update headers
|
4
|
+
|
5
|
+
|
6
|
+
## 0.0.7
|
7
|
+
|
8
|
+
* added squaring final data
|
9
|
+
|
10
|
+
|
11
|
+
## 0.0.6
|
12
|
+
|
13
|
+
* add Babelfish::Chronometer
|
14
|
+
* add Babelfish.guess_frequency
|
15
|
+
|
16
|
+
## 0.0.5
|
17
|
+
|
18
|
+
* improve error messages with line, row, context
|
19
|
+
* add Quandl::Error::Standard, all errors inherit from Error::Standard
|
20
|
+
|
21
|
+
|
22
|
+
## 0.0.4
|
23
|
+
|
24
|
+
* remove quandl_data as a dependency
|
25
|
+
|
26
|
+
|
27
|
+
## 0.0.3
|
28
|
+
|
29
|
+
* Add Quandl::Data as a runtime dependency (add_runtime_dependency)
|
30
|
+
* refactor Babelfish::Data to inherit from Quandl::Data
|
31
|
+
* refactor specs
|
32
|
+
|
33
|
+
|
34
|
+
## 0.0.1
|
35
|
+
|
36
|
+
* replace Cleaner.process return clean_array, header with Quandl::Babelfish::Data.new( clean_array, headers: header )
|
37
|
+
* refactored error
|
38
|
+
* added header extraction support
|
32
39
|
* init
|
data/lib/quandl/babelfish.rb
CHANGED
@@ -1,29 +1,29 @@
|
|
1
|
-
require "quandl/babelfish/version"
|
2
|
-
|
3
|
-
require "quandl/babelfish/helper"
|
4
|
-
require "quandl/babelfish/cleaner"
|
5
|
-
require "quandl/babelfish/date_maid"
|
6
|
-
require "quandl/babelfish/number_maid"
|
7
|
-
require "quandl/babelfish/chronometer"
|
8
|
-
|
9
|
-
require 'quandl/error/standard'
|
10
|
-
require 'quandl/error/guess_date_format'
|
11
|
-
require 'quandl/error/invalid_date'
|
12
|
-
require 'quandl/error/unknown_date_format'
|
13
|
-
|
14
|
-
module Quandl
|
15
|
-
module Babelfish
|
16
|
-
|
17
|
-
class << self
|
18
|
-
def clean(data, date_settings={}, number_settings={})
|
19
|
-
Cleaner::process data, date_settings, number_settings
|
20
|
-
end
|
21
|
-
|
22
|
-
def guess_frequency(data)
|
23
|
-
Chronometer::process data
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
27
|
-
|
28
|
-
end
|
1
|
+
require "quandl/babelfish/version"
|
2
|
+
|
3
|
+
require "quandl/babelfish/helper"
|
4
|
+
require "quandl/babelfish/cleaner"
|
5
|
+
require "quandl/babelfish/date_maid"
|
6
|
+
require "quandl/babelfish/number_maid"
|
7
|
+
require "quandl/babelfish/chronometer"
|
8
|
+
|
9
|
+
require 'quandl/error/standard'
|
10
|
+
require 'quandl/error/guess_date_format'
|
11
|
+
require 'quandl/error/invalid_date'
|
12
|
+
require 'quandl/error/unknown_date_format'
|
13
|
+
|
14
|
+
module Quandl
|
15
|
+
module Babelfish
|
16
|
+
|
17
|
+
class << self
|
18
|
+
def clean(data, date_settings={}, number_settings={})
|
19
|
+
Cleaner::process data, date_settings, number_settings
|
20
|
+
end
|
21
|
+
|
22
|
+
def guess_frequency(data)
|
23
|
+
Chronometer::process data
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
29
29
|
end
|
@@ -1,44 +1,44 @@
|
|
1
|
-
module Quandl
|
2
|
-
module Babelfish
|
3
|
-
|
4
|
-
class Chronometer
|
5
|
-
class << self
|
6
|
-
|
7
|
-
#return frequency and warning message if present
|
8
|
-
def process(table)
|
9
|
-
# guesses date frequency in a table
|
10
|
-
return nil if table.nil? || table.size==0
|
11
|
-
return 'daily' if table.size==1 #not enough , need more points
|
12
|
-
freqs = []
|
13
|
-
fmt = "%Y-%m"
|
14
|
-
fmt = "%Y" if table[0][0].to_s !~ /-/
|
15
|
-
fmt = "%Y-%m-%d" if table[0][0].to_s =~ /^.*-.*-.*$/
|
16
|
-
|
17
|
-
table.each_index do |r|
|
18
|
-
break if r==6 #first 6 record is enough to analyze
|
19
|
-
if table[r+1].nil?
|
20
|
-
break
|
21
|
-
else
|
22
|
-
diff = (Date.strptime(table[r+1][0].to_s, fmt) -
|
23
|
-
Date.strptime(table[r][0].to_s, fmt)).to_i.abs
|
24
|
-
if diff < 4
|
25
|
-
freqs << 'daily'
|
26
|
-
elsif diff < 10
|
27
|
-
freqs << 'weekly'
|
28
|
-
elsif diff < 60
|
29
|
-
freqs << 'monthly'
|
30
|
-
elsif diff < 200
|
31
|
-
freqs << 'quarterly'
|
32
|
-
else
|
33
|
-
freqs << 'annual'
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
return freqs.sort_by { |e| freqs.count(e) }.reverse.first#, nil
|
38
|
-
end
|
39
|
-
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
end
|
1
|
+
module Quandl
|
2
|
+
module Babelfish
|
3
|
+
|
4
|
+
class Chronometer
|
5
|
+
class << self
|
6
|
+
|
7
|
+
#returns the guessed frequency (the warning message is not returned at present)
|
8
|
+
def process(table)
|
9
|
+
# guesses date frequency in a table
|
10
|
+
return nil if table.nil? || table.size==0
|
11
|
+
return 'daily' if table.size==1 #not enough , need more points
|
12
|
+
freqs = []
|
13
|
+
fmt = "%Y-%m"
|
14
|
+
fmt = "%Y" if table[0][0].to_s !~ /-/
|
15
|
+
fmt = "%Y-%m-%d" if table[0][0].to_s =~ /^.*-.*-.*$/
|
16
|
+
|
17
|
+
table.each_index do |r|
|
18
|
+
break if r==6 #first 6 record is enough to analyze
|
19
|
+
if table[r+1].nil?
|
20
|
+
break
|
21
|
+
else
|
22
|
+
diff = (Date.strptime(table[r+1][0].to_s, fmt) -
|
23
|
+
Date.strptime(table[r][0].to_s, fmt)).to_i.abs
|
24
|
+
if diff < 4
|
25
|
+
freqs << 'daily'
|
26
|
+
elsif diff < 10
|
27
|
+
freqs << 'weekly'
|
28
|
+
elsif diff < 60
|
29
|
+
freqs << 'monthly'
|
30
|
+
elsif diff < 200
|
31
|
+
freqs << 'quarterly'
|
32
|
+
else
|
33
|
+
freqs << 'annual'
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
return freqs.sort_by { |e| freqs.count(e) }.reverse.first#, nil
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
44
44
|
end
|
@@ -1,33 +1,34 @@
|
|
1
|
-
module Quandl
|
2
|
-
module Babelfish
|
3
|
-
|
4
|
-
class Cleaner
|
5
|
-
class << self
|
6
|
-
def process(dirty_array, date_settings={}, number_settings={})
|
7
|
-
return nil,nil if dirty_array.nil?
|
8
|
-
|
9
|
-
#check if first line is header
|
10
|
-
header=DateMaid::disinfect(dirty_array.shift) unless DateMaid::analyze_date_format(DateMaid::disinfect(dirty_array[0][0])[0])[0]
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
end
|
31
|
-
|
32
|
-
|
1
|
+
module Quandl
|
2
|
+
module Babelfish
|
3
|
+
|
4
|
+
class Cleaner
|
5
|
+
class << self
|
6
|
+
def process(dirty_array, date_settings={}, number_settings={})
|
7
|
+
return nil,nil if dirty_array.nil?
|
8
|
+
|
9
|
+
#check if first line is header
|
10
|
+
header=DateMaid::disinfect(dirty_array.shift) unless DateMaid::analyze_date_format(DateMaid::disinfect(dirty_array[0][0])[0])[0]
|
11
|
+
return [], header if dirty_array.empty?
|
12
|
+
|
13
|
+
#converts dates first
|
14
|
+
dirty_array
|
15
|
+
dates = dirty_array.collect{|x| x[0]}
|
16
|
+
DateMaid::init(date_settings)
|
17
|
+
clean_dates=DateMaid::sweep dates
|
18
|
+
|
19
|
+
clean_array=[]
|
20
|
+
#clean numbers later
|
21
|
+
NumberMaid::init(number_settings)
|
22
|
+
dirty_array.each.with_index do |row, i|
|
23
|
+
new_row=[]
|
24
|
+
(new_row << clean_dates[i]).concat Array(NumberMaid::clean(row[1..-1])) #add clean date and all clean numbers
|
25
|
+
clean_array << new_row
|
26
|
+
end
|
27
|
+
|
28
|
+
return Helper::make_square(clean_array), header
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
33
34
|
end
|
@@ -1,238 +1,238 @@
|
|
1
|
-
module Quandl
|
2
|
-
module Babelfish
|
3
|
-
|
4
|
-
#responsible for number formatting
|
5
|
-
class DateMaid
|
6
|
-
@defaults = {
|
7
|
-
:format => nil
|
8
|
-
}
|
9
|
-
|
10
|
-
@settings = @defaults #init with defaults
|
11
|
-
|
12
|
-
class << self
|
13
|
-
|
14
|
-
def init(user_settings)
|
15
|
-
@settings=@defaults.merge(user_settings)
|
16
|
-
end
|
17
|
-
|
18
|
-
#looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
|
19
|
-
def sweep(all_dates)
|
20
|
-
return nil if all_dates.nil?
|
21
|
-
|
22
|
-
all_dates = disinfect all_dates
|
23
|
-
|
24
|
-
if @settings[:format].nil?
|
25
|
-
#find good example and extract all info from it and apply it to each of the dates in the set
|
26
|
-
good_sample = find_good_date(all_dates)
|
27
|
-
|
28
|
-
raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?
|
29
|
-
|
30
|
-
date_format, frequency = analyze_date_format(good_sample)
|
31
|
-
|
32
|
-
|
33
|
-
else
|
34
|
-
date_format = @settings[:format]
|
35
|
-
end
|
36
|
-
|
37
|
-
iso_dates=[]
|
38
|
-
all_dates.each_with_index do |fuzzy_date, i|
|
39
|
-
temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
|
40
|
-
iso_dates << frequency_transform(temp_date, frequency)
|
41
|
-
end
|
42
|
-
|
43
|
-
iso_dates
|
44
|
-
end
|
45
|
-
|
46
|
-
def analyze_date_format(example)
|
47
|
-
return nil if example.nil?
|
48
|
-
|
49
|
-
# Regular formats and Custom formats (where Date.parse and Date.strptime
|
50
|
-
# fear to tread)
|
51
|
-
if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
|
52
|
-
if re[1].to_i > 12
|
53
|
-
return '%d-%m-%Y', nil
|
54
|
-
else
|
55
|
-
return '%m-%d-%Y', nil
|
56
|
-
end
|
57
|
-
end
|
58
|
-
if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
|
59
|
-
if re[1].to_i > 12
|
60
|
-
return '%d-%m-%y', nil
|
61
|
-
else
|
62
|
-
return '%m-%d-%y', nil
|
63
|
-
end
|
64
|
-
end
|
65
|
-
# order these guys from most specific to most general
|
66
|
-
return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
|
67
|
-
return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
|
68
|
-
return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
|
69
|
-
return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
|
70
|
-
return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
|
71
|
-
return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
|
72
|
-
return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
|
73
|
-
return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
|
74
|
-
return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
|
75
|
-
return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
|
76
|
-
return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
|
77
|
-
return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
|
78
|
-
return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
|
79
|
-
return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
|
80
|
-
return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
|
81
|
-
return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
|
82
|
-
return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
|
83
|
-
return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
|
84
|
-
return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
|
85
|
-
return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
|
86
|
-
return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
|
87
|
-
return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits
|
88
|
-
return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits
|
89
|
-
|
90
|
-
#our custom formats
|
91
|
-
return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
|
92
|
-
return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
|
93
|
-
return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel
|
94
|
-
|
95
|
-
# No, try default date parse
|
96
|
-
# raise PostProcessorException, "Unable to guess date format for #{example}"
|
97
|
-
[nil, nil]
|
98
|
-
end
|
99
|
-
|
100
|
-
def disinfect(dates)
|
101
|
-
[*dates].collect do |date|
|
102
|
-
date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
|
103
|
-
date.to_s.gsub!(/[^\x01-\x7f]/,'')
|
104
|
-
date.to_s.strip.gsub(/\s\s+/, ' ')
|
105
|
-
end
|
106
|
-
end
|
107
|
-
private
|
108
|
-
|
109
|
-
|
110
|
-
#converts date to specified format
|
111
|
-
def convert(fuzzy_date, date_format)
|
112
|
-
if date_format.nil?
|
113
|
-
# Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
|
114
|
-
tokens = fuzzy_date.split(/\D/)
|
115
|
-
if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
|
116
|
-
# Its ISO
|
117
|
-
return DateTime.parse(fuzzy_date.to_s).to_date
|
118
|
-
else
|
119
|
-
# Guessing US
|
120
|
-
return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
|
121
|
-
end
|
122
|
-
else
|
123
|
-
case date_format
|
124
|
-
when ':year_quarter'
|
125
|
-
return year_quarter_formatter(fuzzy_date)
|
126
|
-
when ':excel-1900'
|
127
|
-
return excel_1900_formatter(fuzzy_date)
|
128
|
-
else #regular ruby formatter
|
129
|
-
return regular_formatter(fuzzy_date, date_format)
|
130
|
-
end
|
131
|
-
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
|
136
|
-
def year_quarter_formatter(fuzzy_date)
|
137
|
-
raw_date = fuzzy_date
|
138
|
-
tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
|
139
|
-
tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
|
140
|
-
Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
|
141
|
-
end
|
142
|
-
|
143
|
-
def excel_1900_formatter(fuzzy_date)
|
144
|
-
# handle Lotus 123 bug has 1900 as a leap year
|
145
|
-
Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
|
146
|
-
end
|
147
|
-
|
148
|
-
def regular_formatter(fuzzy_date, date_format)
|
149
|
-
# We have a date format - oh so pretty, but...
|
150
|
-
date_string = fuzzy_date
|
151
|
-
# normalize delimiters to hyphens so we do not have to make a format for each one.
|
152
|
-
# delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
|
153
|
-
# only if no format where provided
|
154
|
-
date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?
|
155
|
-
|
156
|
-
#epoch date string
|
157
|
-
if date_format == 'epoch'
|
158
|
-
news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
|
159
|
-
formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
|
160
|
-
else
|
161
|
-
if date_string.to_s =~ /^(\w{3})\D(\d{2})$/
|
162
|
-
century = $2.to_i < 25 ? '20' : '19'
|
163
|
-
date_string = "#{$1} #{century}#{$2}"
|
164
|
-
formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
|
165
|
-
else
|
166
|
-
formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
|
167
|
-
end
|
168
|
-
end
|
169
|
-
formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
|
170
|
-
formatted_date
|
171
|
-
end
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
#find good example of date to use as template for format
|
176
|
-
def find_good_date(all_dates)
|
177
|
-
good_sample=nil
|
178
|
-
all_dates.each do |fuzzy_date|
|
179
|
-
if usable_cell(fuzzy_date)
|
180
|
-
good_sample = fuzzy_date
|
181
|
-
break
|
182
|
-
end
|
183
|
-
end
|
184
|
-
good_sample
|
185
|
-
end
|
186
|
-
|
187
|
-
def usable_cell(cell)
|
188
|
-
return false if cell.nil? || cell.to_s.empty?
|
189
|
-
return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20
|
190
|
-
|
191
|
-
return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
|
192
|
-
# date is not usable as an example if it is ambiguous as to day and month
|
193
|
-
# 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
|
194
|
-
if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
|
195
|
-
if re[1].to_i <= 12 and re[2].to_i <= 12
|
196
|
-
return false
|
197
|
-
else
|
198
|
-
return true
|
199
|
-
end
|
200
|
-
end
|
201
|
-
|
202
|
-
if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
|
203
|
-
if re[1].to_i <= 12 and re[2].to_i <= 12
|
204
|
-
return false
|
205
|
-
else
|
206
|
-
return true
|
207
|
-
end
|
208
|
-
end
|
209
|
-
|
210
|
-
return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere
|
211
|
-
|
212
|
-
return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous
|
213
|
-
|
214
|
-
false # Thank you, come again
|
215
|
-
end
|
216
|
-
|
217
|
-
# Bump date to the end of the respective periods
|
218
|
-
def frequency_transform(date, frequency)
|
219
|
-
case frequency
|
220
|
-
when 'annual'
|
221
|
-
date = Date.new(date.year,12,31)
|
222
|
-
when 'quarterly'
|
223
|
-
month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
|
224
|
-
date = Date.new(date.year, month, 1).next_month-1
|
225
|
-
when 'monthly'
|
226
|
-
date = Date.new(date.year, date.month,1).next_month-1
|
227
|
-
else
|
228
|
-
# Do nothing for daily or weekly
|
229
|
-
end
|
230
|
-
|
231
|
-
date
|
232
|
-
end
|
233
|
-
|
234
|
-
|
235
|
-
end
|
236
|
-
end
|
237
|
-
end
|
1
|
+
module Quandl
|
2
|
+
module Babelfish
|
3
|
+
|
4
|
+
#responsible for date formatting
|
5
|
+
class DateMaid
|
6
|
+
@defaults = {
|
7
|
+
:format => nil
|
8
|
+
}
|
9
|
+
|
10
|
+
@settings = @defaults #init with defaults
|
11
|
+
|
12
|
+
class << self
|
13
|
+
|
14
|
+
def init(user_settings)
|
15
|
+
@settings=@defaults.merge(user_settings)
|
16
|
+
end
|
17
|
+
|
18
|
+
#looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
|
19
|
+
def sweep(all_dates)
|
20
|
+
return nil if all_dates.nil?
|
21
|
+
|
22
|
+
all_dates = disinfect all_dates
|
23
|
+
|
24
|
+
if @settings[:format].nil?
|
25
|
+
#find good example and extract all info from it and apply it to each of the dates in the set
|
26
|
+
good_sample = find_good_date(all_dates)
|
27
|
+
|
28
|
+
raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?
|
29
|
+
|
30
|
+
date_format, frequency = analyze_date_format(good_sample)
|
31
|
+
|
32
|
+
|
33
|
+
else
|
34
|
+
date_format = @settings[:format]
|
35
|
+
end
|
36
|
+
|
37
|
+
iso_dates=[]
|
38
|
+
all_dates.each_with_index do |fuzzy_date, i|
|
39
|
+
temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
|
40
|
+
iso_dates << frequency_transform(temp_date, frequency)
|
41
|
+
end
|
42
|
+
|
43
|
+
iso_dates
|
44
|
+
end
|
45
|
+
|
46
|
+
def analyze_date_format(example)
|
47
|
+
return nil if example.nil?
|
48
|
+
|
49
|
+
# Regular formats and Custom formats (where Date.parse and Date.strptime
|
50
|
+
# fear to tread)
|
51
|
+
if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
|
52
|
+
if re[1].to_i > 12
|
53
|
+
return '%d-%m-%Y', nil
|
54
|
+
else
|
55
|
+
return '%m-%d-%Y', nil
|
56
|
+
end
|
57
|
+
end
|
58
|
+
if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
|
59
|
+
if re[1].to_i > 12
|
60
|
+
return '%d-%m-%y', nil
|
61
|
+
else
|
62
|
+
return '%m-%d-%y', nil
|
63
|
+
end
|
64
|
+
end
|
65
|
+
# order these guys from most specific to most general
|
66
|
+
return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
|
67
|
+
return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
|
68
|
+
return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
|
69
|
+
return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
|
70
|
+
return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
|
71
|
+
return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
|
72
|
+
return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
|
73
|
+
return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
|
74
|
+
return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
|
75
|
+
return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
|
76
|
+
return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
|
77
|
+
return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
|
78
|
+
return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
|
79
|
+
return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
|
80
|
+
return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
|
81
|
+
return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
|
82
|
+
return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
|
83
|
+
return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
|
84
|
+
return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
|
85
|
+
return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
|
86
|
+
return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
|
87
|
+
return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits
|
88
|
+
return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits
|
89
|
+
|
90
|
+
#our custom formats
|
91
|
+
return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
|
92
|
+
return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
|
93
|
+
return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel
|
94
|
+
|
95
|
+
# No, try default date parse
|
96
|
+
# raise PostProcessorException, "Unable to guess date format for #{example}"
|
97
|
+
[nil, nil]
|
98
|
+
end
|
99
|
+
|
100
|
+
def disinfect(dates)
|
101
|
+
[*dates].collect do |date|
|
102
|
+
date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
|
103
|
+
date.to_s.gsub!(/[^\x01-\x7f]/,'')
|
104
|
+
date.to_s.strip.gsub(/\s\s+/, ' ')
|
105
|
+
end
|
106
|
+
end
|
107
|
+
private
|
108
|
+
|
109
|
+
|
110
|
+
#converts date to specified format
|
111
|
+
def convert(fuzzy_date, date_format)
|
112
|
+
if date_format.nil?
|
113
|
+
# Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
|
114
|
+
tokens = fuzzy_date.split(/\D/)
|
115
|
+
if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
|
116
|
+
# It's ISO
|
117
|
+
return DateTime.parse(fuzzy_date.to_s).to_date
|
118
|
+
else
|
119
|
+
# Guessing US
|
120
|
+
return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
|
121
|
+
end
|
122
|
+
else
|
123
|
+
case date_format
|
124
|
+
when ':year_quarter'
|
125
|
+
return year_quarter_formatter(fuzzy_date)
|
126
|
+
when ':excel-1900'
|
127
|
+
return excel_1900_formatter(fuzzy_date)
|
128
|
+
else #regular ruby formatter
|
129
|
+
return regular_formatter(fuzzy_date, date_format)
|
130
|
+
end
|
131
|
+
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
|
136
|
+
def year_quarter_formatter(fuzzy_date)
|
137
|
+
raw_date = fuzzy_date
|
138
|
+
tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
|
139
|
+
tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
|
140
|
+
Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
|
141
|
+
end
|
142
|
+
|
143
|
+
def excel_1900_formatter(fuzzy_date)
|
144
|
+
# handle the Lotus 1-2-3 bug that treats 1900 as a leap year
|
145
|
+
Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
|
146
|
+
end
|
147
|
+
|
148
|
+
def regular_formatter(fuzzy_date, date_format)
|
149
|
+
# We have a date format - oh so pretty, but...
|
150
|
+
date_string = fuzzy_date
|
151
|
+
# normalize delimiters to hyphens so we do not have to make a format for each one.
|
152
|
+
# delimiters can be letters when it's all numbers and delimiters only when there are letters. Sigh.
|
153
|
+
# only if no format was provided
|
154
|
+
date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?
|
155
|
+
|
156
|
+
#epoch date string
|
157
|
+
if date_format == 'epoch'
|
158
|
+
news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
|
159
|
+
formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
|
160
|
+
else
|
161
|
+
if date_string.to_s =~ /^(\w{3})\D(\d{2})$/
|
162
|
+
century = $2.to_i < 25 ? '20' : '19'
|
163
|
+
date_string = "#{$1} #{century}#{$2}"
|
164
|
+
formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
|
165
|
+
else
|
166
|
+
formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
|
167
|
+
end
|
168
|
+
end
|
169
|
+
formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
|
170
|
+
formatted_date
|
171
|
+
end
|
172
|
+
|
173
|
+
|
174
|
+
|
175
|
+
#find good example of date to use as template for format
|
176
|
+
def find_good_date(all_dates)
|
177
|
+
good_sample=nil
|
178
|
+
all_dates.each do |fuzzy_date|
|
179
|
+
if usable_cell(fuzzy_date)
|
180
|
+
good_sample = fuzzy_date
|
181
|
+
break
|
182
|
+
end
|
183
|
+
end
|
184
|
+
good_sample
|
185
|
+
end
|
186
|
+
|
187
|
+
def usable_cell(cell)
|
188
|
+
return false if cell.nil? || cell.to_s.empty?
|
189
|
+
return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20
|
190
|
+
|
191
|
+
return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
|
192
|
+
# date is not usable as an example if it is ambiguous as to day and month
|
193
|
+
# 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
|
194
|
+
if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
|
195
|
+
if re[1].to_i <= 12 and re[2].to_i <= 12
|
196
|
+
return false
|
197
|
+
else
|
198
|
+
return true
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
|
203
|
+
if re[1].to_i <= 12 and re[2].to_i <= 12
|
204
|
+
return false
|
205
|
+
else
|
206
|
+
return true
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere
|
211
|
+
|
212
|
+
return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous
|
213
|
+
|
214
|
+
false # Thank you, come again
|
215
|
+
end
|
216
|
+
|
217
|
+
# Bump date to the end of the respective periods
|
218
|
+
def frequency_transform(date, frequency)
|
219
|
+
case frequency
|
220
|
+
when 'annual'
|
221
|
+
date = Date.new(date.year,12,31)
|
222
|
+
when 'quarterly'
|
223
|
+
month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
|
224
|
+
date = Date.new(date.year, month, 1).next_month-1
|
225
|
+
when 'monthly'
|
226
|
+
date = Date.new(date.year, date.month,1).next_month-1
|
227
|
+
else
|
228
|
+
# Do nothing for daily or weekly
|
229
|
+
end
|
230
|
+
|
231
|
+
date
|
232
|
+
end
|
233
|
+
|
234
|
+
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
238
238
|
end
|