clusterkit 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.simplecov +47 -0
- data/CHANGELOG.md +35 -0
- data/CLAUDE.md +226 -0
- data/Cargo.toml +8 -0
- data/Gemfile +17 -0
- data/IMPLEMENTATION_NOTES.md +143 -0
- data/LICENSE.txt +21 -0
- data/PYTHON_COMPARISON.md +183 -0
- data/README.md +499 -0
- data/Rakefile +245 -0
- data/clusterkit.gemspec +45 -0
- data/docs/KNOWN_ISSUES.md +130 -0
- data/docs/RUST_ERROR_HANDLING.md +164 -0
- data/docs/TEST_FIXTURES.md +170 -0
- data/docs/UMAP_EXPLAINED.md +362 -0
- data/docs/UMAP_TROUBLESHOOTING.md +284 -0
- data/docs/VERBOSE_OUTPUT.md +84 -0
- data/examples/hdbscan_example.rb +147 -0
- data/examples/optimal_kmeans_example.rb +96 -0
- data/examples/pca_example.rb +114 -0
- data/examples/reproducible_umap.rb +99 -0
- data/examples/verbose_control.rb +43 -0
- data/ext/clusterkit/Cargo.toml +25 -0
- data/ext/clusterkit/extconf.rb +4 -0
- data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
- data/ext/clusterkit/src/clustering.rs +267 -0
- data/ext/clusterkit/src/embedder.rs +413 -0
- data/ext/clusterkit/src/lib.rs +22 -0
- data/ext/clusterkit/src/svd.rs +112 -0
- data/ext/clusterkit/src/tests.rs +16 -0
- data/ext/clusterkit/src/utils.rs +33 -0
- data/lib/clusterkit/clustering/hdbscan.rb +177 -0
- data/lib/clusterkit/clustering.rb +213 -0
- data/lib/clusterkit/clusterkit.rb +9 -0
- data/lib/clusterkit/configuration.rb +24 -0
- data/lib/clusterkit/dimensionality/pca.rb +251 -0
- data/lib/clusterkit/dimensionality/svd.rb +144 -0
- data/lib/clusterkit/dimensionality/umap.rb +311 -0
- data/lib/clusterkit/dimensionality.rb +29 -0
- data/lib/clusterkit/hdbscan_api_design.rb +142 -0
- data/lib/clusterkit/preprocessing.rb +106 -0
- data/lib/clusterkit/silence.rb +42 -0
- data/lib/clusterkit/utils.rb +51 -0
- data/lib/clusterkit/version.rb +5 -0
- data/lib/clusterkit.rb +93 -0
- data/lib/tasks/visualize.rake +641 -0
- metadata +194 -0
@@ -0,0 +1,106 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Pure Ruby implementation of preprocessing functions
|
4
|
+
|
5
|
+
module ClusterKit
  # Data preprocessing utilities implemented in pure Ruby.
  #
  # All normalizers take a 2D Array (rows of numeric values) and return a new
  # 2D Array; the input rows are never mutated.
  module Preprocessing
    class << self
      # Normalize data using the specified method.
      #
      # @param data [Array<Array<Numeric>>] Input data (2D array, one row per sample)
      # @param method [Symbol] Normalization method:
      #   :standard - per column, subtract the mean and divide by the
      #               (population) standard deviation
      #   :minmax   - per column, rescale to the [0, 1] range
      #   :l2       - per row, divide by the Euclidean norm
      # @return [Array<Array<Float>>] Normalized data (the input itself when it
      #   is empty and the method is :standard or :minmax)
      # @raise [ArgumentError] if data is not an Array or the method is unknown
      def normalize(data, method: :standard)
        raise ArgumentError, "Unsupported data type: #{data.class}" unless data.is_a?(Array)

        case method
        when :standard then standard_normalize(data)
        when :minmax   then minmax_normalize(data)
        when :l2       then l2_normalize(data)
        else
          raise ArgumentError, "Unknown normalization method: #{method}"
        end
      end

      # Reduce dimensionality using PCA before embedding.
      #
      # Not available here: a pure-Ruby version would need its own SVD, so this
      # method always raises and points callers at the SVD module instead.
      #
      # @param data [Array] Input data (unused)
      # @param n_components [Integer] Number of PCA components (unused)
      # @raise [NotImplementedError] always. Note that NotImplementedError is a
      #   ScriptError, so a bare `rescue` does not catch it.
      def pca_reduce(data, n_components)
        raise NotImplementedError, "PCA reduction requires the SVD module which needs to be called directly"
      end

      private

      # Column-wise standardization: (value - column mean) / column std.
      # The standard deviation is the population one (divides by the row count).
      def standard_normalize(data)
        return data if data.empty?

        n_rows = data.size
        n_cols = data.first.size

        # Accumulators start as Floats, so Integer input still yields Float output.
        means = Array.new(n_cols, 0.0)
        data.each do |row|
          row.each_with_index { |val, j| means[j] += val }
        end
        means.map! { |sum| sum / n_rows }

        squared_deviations = Array.new(n_cols, 0.0)
        data.each do |row|
          row.each_with_index { |val, j| squared_deviations[j] += (val - means[j])**2 }
        end
        stds = squared_deviations.map do |sq_sum|
          std = Math.sqrt(sq_sum / n_rows)
          std.zero? ? 1.0 : std # constant column: avoid division by zero
        end

        data.map do |row|
          row.each_with_index.map { |val, j| (val - means[j]) / stds[j] }
        end
      end

      # Column-wise min-max scaling: (value - column min) / (column max - column min).
      def minmax_normalize(data)
        return data if data.empty?

        n_cols = data.first.size
        mins = Array.new(n_cols) { Float::INFINITY }
        maxs = Array.new(n_cols) { -Float::INFINITY }

        data.each do |row|
          row.each_with_index do |val, j|
            mins[j] = val if val < mins[j]
            maxs[j] = val if val > maxs[j]
          end
        end

        # The range is forced to Float: with Integer input an Integer range
        # would make the division below truncate every value to 0 or 1.
        ranges = mins.zip(maxs).map do |lo, hi|
          span = (hi - lo).to_f
          span.zero? ? 1.0 : span # constant column: avoid division by zero
        end

        data.map do |row|
          row.each_with_index.map { |val, j| (val - mins[j]) / ranges[j] }
        end
      end

      # Row-wise L2 normalization: each row is divided by its Euclidean norm.
      def l2_normalize(data)
        data.map do |row|
          norm = Math.sqrt(row.sum { |val| val**2 })
          norm = 1.0 if norm.zero? # all-zero row: avoid division by zero
          row.map { |val| val / norm }
        end
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ClusterKit
|
4
|
+
# Module to suppress stdout output from the Rust library
|
5
|
+
# Following the pattern used by Rails/ActiveSupport and other popular gems
|
6
|
+
module Silence
|
7
|
+
# Temporarily silence stdout and stderr
|
8
|
+
# This is the most idiomatic Ruby way to suppress output from C/Rust extensions
|
9
|
+
#
|
10
|
+
# @example
|
11
|
+
# ClusterKit::Silence.silence_stream(STDOUT) do
|
12
|
+
# # code that produces unwanted output
|
13
|
+
# end
|
14
|
+
def self.silence_stream(stream)
|
15
|
+
old_stream = stream.dup
|
16
|
+
stream.reopen(File::NULL)
|
17
|
+
stream.sync = true
|
18
|
+
yield
|
19
|
+
ensure
|
20
|
+
stream.reopen(old_stream)
|
21
|
+
old_stream.close
|
22
|
+
end
|
23
|
+
|
24
|
+
# Silence both stdout and stderr
|
25
|
+
def self.silence_output
|
26
|
+
silence_stream(STDOUT) do
|
27
|
+
silence_stream(STDERR) do
|
28
|
+
yield
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
# Conditionally silence based on configuration
|
34
|
+
def self.maybe_silence
|
35
|
+
if ClusterKit.configuration.verbose
|
36
|
+
yield
|
37
|
+
else
|
38
|
+
silence_output { yield }
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Pure Ruby utility functions
|
4
|
+
|
5
|
+
module ClusterKit
  # Utility functions for data analysis.
  #
  # The estimators delegate to `*_rust` methods that are not defined in this
  # file (presumably provided by the native extension).
  module Utils
    class << self
      # Estimate the intrinsic dimension of data.
      #
      # @param data [Array] Input data (anything other than an Array is rejected)
      # @param k_neighbors [Integer] Number of neighbors to consider
      # @return [Float] Estimated intrinsic dimension
      # @raise [ArgumentError] if data is not an Array
      def estimate_intrinsic_dimension(data, k_neighbors: 10)
        ensure_array!(data)
        estimate_intrinsic_dimension_rust(data, k_neighbors)
      end

      # Estimate hubness in the data.
      #
      # @param data [Array] Input data (anything other than an Array is rejected)
      # @return [Hash] Hubness statistics, with keys converted to symbols
      # @raise [ArgumentError] if data is not an Array
      def estimate_hubness(data)
        ensure_array!(data)
        symbolize_keys(estimate_hubness_rust(data))
      end

      # Measure neighborhood stability through embedding (not implemented yet).
      #
      # @param original_data [Array] Original high-dimensional data
      # @param embedded_data [Array] Embedded low-dimensional data
      # @param k [Integer] Number of neighbors to check
      # @return [Float] Stability score (0-1, higher is better) once implemented
      # @raise [ArgumentError] if either argument is not an Array
      # @raise [NotImplementedError] always, after the arguments are validated
      def neighborhood_stability(original_data, embedded_data, k: 15)
        [original_data, embedded_data].each { |dataset| ensure_array!(dataset) }

        # TODO: Implement neighborhood stability calculation
        raise NotImplementedError, "Neighborhood stability not implemented yet"
      end

      private

      # Reject any input that is not a plain Array.
      def ensure_array!(candidate)
        return if candidate.is_a?(Array)

        raise ArgumentError, "Unsupported data type: #{candidate.class}"
      end

      # Convert the keys of a Hash to symbols; non-Hash values pass through untouched.
      def symbolize_keys(hash)
        hash.is_a?(Hash) ? hash.transform_keys { |name| name.to_s.to_sym } : hash
      end
    end
  end
end
|
data/lib/clusterkit.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "clusterkit/version"
|
4
|
+
require_relative "clusterkit/clusterkit"
|
5
|
+
require_relative "clusterkit/configuration"
|
6
|
+
|
7
|
+
# Main module for ClusterKit gem
|
8
|
+
# Provides high-performance dimensionality reduction algorithms
|
9
|
+
module ClusterKit
  # Base class for all ClusterKit errors
  class Error < StandardError; end

  # Core error classes
  class DimensionError < Error; end
  class ConvergenceError < Error; end
  class InvalidParameterError < Error; end

  # Data-related errors
  class DataError < Error; end
  class IsolatedPointError < DataError; end
  class DisconnectedGraphError < DataError; end
  class InsufficientDataError < DataError; end

  # Load the algorithm modules eagerly, after the error classes above are defined.
  # "clusterkit/clusterkit" is already required at the top of this file, so it
  # does not need to be required again here.
  require_relative "clusterkit/dimensionality"
  require_relative "clusterkit/clustering"

  # Autoload utilities (paths are resolved through $LOAD_PATH on first use)
  autoload :Utils, "clusterkit/utils"
  autoload :Preprocessing, "clusterkit/preprocessing"
  autoload :Silence, "clusterkit/silence"

  class << self
    # Quick UMAP embedding
    # @param data [Array] Input data
    # @param n_components [Integer] Number of dimensions in output
    # @param options [Hash] Extra keyword arguments forwarded to Dimensionality::UMAP.new
    # @return [Array] Embedded data
    def umap(data, n_components: 2, **options)
      umap = Dimensionality::UMAP.new(n_components: n_components, **options)
      umap.fit_transform(data)
    end

    # Quick PCA
    # @param data [Array] Input data
    # @param n_components [Integer] Number of dimensions in output
    # @return [Array] Transformed data
    def pca(data, n_components: 2)
      pca = Dimensionality::PCA.new(n_components: n_components)
      pca.fit_transform(data)
    end

    # t-SNE is not yet implemented
    # @deprecated Not implemented - use UMAP instead
    # @raise [NotImplementedError] always. NotImplementedError is a ScriptError,
    #   so a bare `rescue` does not catch it.
    def tsne(data, n_components: 2, **options)
      raise NotImplementedError, "t-SNE is not yet implemented. Please use UMAP instead, which provides similar dimensionality reduction capabilities."
    end

    # Estimate intrinsic dimension of data
    # @param data [Array] Input data (Utils.estimate_intrinsic_dimension rejects non-Array input)
    # @param k [Integer] Number of neighbors to consider
    # @return [Float] Estimated intrinsic dimension
    # @raise [ArgumentError] if data is not an Array
    def estimate_dimension(data, k: 10)
      Utils.estimate_intrinsic_dimension(data, k_neighbors: k)
    end

    # Perform SVD
    # @param matrix [Array] Input matrix
    # @param k [Integer] Number of components
    # @param n_iter [Integer] Number of iterations for randomized algorithm
    # @return [Array] U, S, V matrices (whatever Dimensionality::SVD#fit_transform returns)
    def svd(matrix, k, n_iter: 2)
      svd = Dimensionality::SVD.new(n_components: k, n_iter: n_iter)
      svd.fit_transform(matrix)
    end

    # Quick K-means with automatic k detection
    # @param data [Array] Input data
    # @param k [Integer, nil] Number of clusters (auto-detect if nil)
    # @param k_range [Range] Range for auto-detection (only used when k is nil)
    # @param options [Hash] Extra keyword arguments forwarded to Clustering::KMeans.new
    # @return [Array] Cluster labels
    def kmeans(data, k: nil, k_range: 2..10, **options)
      k ||= Clustering::KMeans.optimal_k(data, k_range: k_range)
      kmeans = Clustering::KMeans.new(k: k, **options)
      kmeans.fit_predict(data)
    end
  end
end
|