yannitor 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/yannitor/cleaner.rb +39 -35
- data/lib/yannitor/version.rb +1 -1
- data/lib/yannitor.rb +4 -3
- data/yannitor.gemspec +3 -11
- metadata +19 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 3aa7e3ec52c46173c2b250128ae91232b56f5f77128b96076e03a4bb754d7ebc
|
4
|
+
data.tar.gz: 96ba16f09a7d85186d97228758f4a2ff91f996f45204f05f82b6fb1c1b741c4f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f786eaffca03c775ab53f0931ce45ca1c66896132e9a3d052547d4a71711d32ef044378370b077b4bdfcbeea79f939ee002ff18f8fe821367113de0aa694f7d8
|
7
|
+
data.tar.gz: f87a486ac25649dab0f2e6ffb10e3030061d6e6d96c12e938fa4e2efb0e9f43b2b12720dcb29ccf5befc26cb78d28b5bbe135e60a430eeb84dac970e7f552208
|
data/lib/yannitor/cleaner.rb
CHANGED
@@ -1,59 +1,64 @@
|
|
1
|
-
|
2
|
-
# require 'active_record/version'
|
3
|
-
# require 'active_support/core_ext/module'
|
1
|
+
# frozen_string_literal: true
|
4
2
|
|
5
|
-
|
6
|
-
# require 'rails/engine'
|
7
|
-
# end
|
3
|
+
require 'active_record'
|
8
4
|
|
9
5
|
module Yannitor
|
10
6
|
module Broom
|
11
|
-
attr_accessor :
|
7
|
+
attr_accessor :yannitor_features
|
12
8
|
|
13
9
|
def yannitor_is_cleaning(feats = {})
|
14
|
-
self.
|
10
|
+
self.yannitor_features = feats
|
15
11
|
end
|
16
12
|
|
17
|
-
def to_one_hot
|
18
|
-
sorted_value_array =
|
19
|
-
|
20
|
-
|
21
|
-
values_select = %
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
13
|
+
def to_one_hot(target_column, type = 'text')
|
14
|
+
sorted_value_array = pluck("distinct(#{target_column})").join("'), ('")
|
15
|
+
|
16
|
+
table_name = self.table_name
|
17
|
+
values_select = %(
|
18
|
+
SELECT value FROM (values ('#{sorted_value_array}')) s(value)
|
19
|
+
)
|
20
|
+
|
21
|
+
self.select(%(
|
22
|
+
#{table_name}.id,
|
23
|
+
ARRAY_AGG(CASE
|
24
|
+
WHEN sorted_value_table.value::#{type} = #{table_name}.#{target_column}::#{type}
|
25
|
+
THEN 1
|
26
|
+
ELSE 0
|
27
|
+
END
|
28
|
+
) AS o#{target_column}
|
29
|
+
)).joins(%(
|
27
30
|
LEFT JOIN (#{values_select}) AS sorted_value_table ON 1=1
|
28
|
-
)).group("#{
|
31
|
+
)).group("#{table_name}.id")
|
29
32
|
end
|
30
33
|
|
31
34
|
def vectorize
|
32
|
-
|
33
|
-
|
34
|
-
select('*, ' + features[:linear].map do |feature|
|
35
|
-
min = all.minimum(feature)
|
36
|
-
max = all.maximum(feature)
|
37
|
-
"CAST((#{_table_name}.#{feature}::float - #{min}::float) / (#{max}::float - #{min}::float) AS float) as n#{feature}"
|
38
|
-
end.join(', ')).all.map do |obj|
|
35
|
+
select('*, ' + linear_feature_select).build_linear_features
|
36
|
+
end
|
39
37
|
|
40
|
-
|
38
|
+
def build_linear_features
|
39
|
+
all.map do |obj|
|
40
|
+
obj.class.yannitor_features[:linear].map do |feature|
|
41
41
|
obj.send("n#{feature}").to_f
|
42
42
|
end
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
46
|
-
def
|
47
|
-
|
46
|
+
def linear_feature_select
|
47
|
+
yannitor_features[:linear].map do |feature|
|
48
|
+
min = all.minimum(feature)
|
49
|
+
max = all.maximum(feature)
|
50
|
+
"CAST((#{table_name}.#{feature}::float - #{min}::float) / (#{max}::float - #{min}::float) AS float) as n#{feature}"
|
51
|
+
end.join(', ')
|
52
|
+
end
|
48
53
|
|
54
|
+
def nelect(feature)
|
49
55
|
min = all.minimum(feature)
|
50
56
|
max = all.maximum(feature)
|
51
|
-
|
52
|
-
|
57
|
+
|
58
|
+
select("*, (#{table_name}.#{feature}::float - #{min}::float) / (#{max}::float - #{min}::float)::float as n#{feature}")
|
53
59
|
end
|
54
60
|
|
55
61
|
def normalize(feature)
|
56
|
-
print "Normalizing #{feature}"
|
57
62
|
min = all.minimum(feature)
|
58
63
|
max = all.maximum(feature)
|
59
64
|
data = all.nelect(feature).map do |e|
|
@@ -63,14 +68,13 @@ module Yannitor
|
|
63
68
|
[data, min, max]
|
64
69
|
end
|
65
70
|
|
66
|
-
def to_file
|
67
|
-
CSV.open(
|
71
|
+
def to_file(file_name = 'data.csv', separator = ' ')
|
72
|
+
CSV.open(file_name, 'wb', col_sep: separator) do |csv|
|
68
73
|
all.vectorize.each { |v| csv << v }
|
69
74
|
end
|
70
75
|
|
71
76
|
nil
|
72
77
|
end
|
73
|
-
|
74
78
|
end
|
75
79
|
end
|
76
80
|
|
data/lib/yannitor/version.rb
CHANGED
data/lib/yannitor.rb
CHANGED
data/yannitor.gemspec
CHANGED
@@ -7,21 +7,12 @@ Gem::Specification.new do |spec|
|
|
7
7
|
spec.name = "yannitor"
|
8
8
|
spec.version = Yannitor::VERSION
|
9
9
|
spec.authors = ["Danielius Visockas"]
|
10
|
-
spec.email = ["
|
10
|
+
spec.email = ["danieliusvisockas@gmail.com"]
|
11
11
|
|
12
|
-
spec.summary = %q{
|
12
|
+
spec.summary = %q{Helps you build one-hot or min-max encoded vectors from ActiveRecord collections}
|
13
13
|
spec.description = %q{I'll clean your data}
|
14
14
|
spec.homepage = "https://github.com"
|
15
15
|
|
16
|
-
# Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
|
17
|
-
# to allow pushing to a single host or delete this section to allow pushing to any host.
|
18
|
-
# if spec.respond_to?(:metadata)
|
19
|
-
# spec.metadata['allowed_push_host'] = "'http://mygemserver.com'"
|
20
|
-
# else
|
21
|
-
# raise "RubyGems 2.0 or newer is required to protect against " \
|
22
|
-
# "public gem pushes."
|
23
|
-
# end
|
24
|
-
|
25
16
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
26
17
|
f.match(%r{^(test|spec|features)/})
|
27
18
|
end
|
@@ -29,6 +20,7 @@ Gem::Specification.new do |spec|
|
|
29
20
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
30
21
|
spec.require_paths = ["lib"]
|
31
22
|
|
23
|
+
spec.add_runtime_dependency "active_record", ["> 3.2.0"]
|
32
24
|
spec.add_development_dependency "bundler", "~> 1.14"
|
33
25
|
spec.add_development_dependency "rake", "~> 10.0"
|
34
26
|
spec.add_development_dependency "rspec", "~> 3.0"
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yannitor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Danielius Visockas
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-04-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: active_record
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 3.2.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 3.2.0
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: bundler
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -54,7 +68,7 @@ dependencies:
|
|
54
68
|
version: '3.0'
|
55
69
|
description: I'll clean your data
|
56
70
|
email:
|
57
|
-
-
|
71
|
+
- danieliusvisockas@gmail.com
|
58
72
|
executables: []
|
59
73
|
extensions: []
|
60
74
|
extra_rdoc_files: []
|
@@ -91,8 +105,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
91
105
|
version: '0'
|
92
106
|
requirements: []
|
93
107
|
rubyforge_project:
|
94
|
-
rubygems_version: 2.
|
108
|
+
rubygems_version: 2.7.8
|
95
109
|
signing_key:
|
96
110
|
specification_version: 4
|
97
|
-
summary:
|
111
|
+
summary: Helps you build one-hot or min-max encoded vectors from ActiveRecord collections
|
98
112
|
test_files: []
|