clusterkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.simplecov +47 -0
  4. data/CHANGELOG.md +35 -0
  5. data/CLAUDE.md +226 -0
  6. data/Cargo.toml +8 -0
  7. data/Gemfile +17 -0
  8. data/IMPLEMENTATION_NOTES.md +143 -0
  9. data/LICENSE.txt +21 -0
  10. data/PYTHON_COMPARISON.md +183 -0
  11. data/README.md +499 -0
  12. data/Rakefile +245 -0
  13. data/clusterkit.gemspec +45 -0
  14. data/docs/KNOWN_ISSUES.md +130 -0
  15. data/docs/RUST_ERROR_HANDLING.md +164 -0
  16. data/docs/TEST_FIXTURES.md +170 -0
  17. data/docs/UMAP_EXPLAINED.md +362 -0
  18. data/docs/UMAP_TROUBLESHOOTING.md +284 -0
  19. data/docs/VERBOSE_OUTPUT.md +84 -0
  20. data/examples/hdbscan_example.rb +147 -0
  21. data/examples/optimal_kmeans_example.rb +96 -0
  22. data/examples/pca_example.rb +114 -0
  23. data/examples/reproducible_umap.rb +99 -0
  24. data/examples/verbose_control.rb +43 -0
  25. data/ext/clusterkit/Cargo.toml +25 -0
  26. data/ext/clusterkit/extconf.rb +4 -0
  27. data/ext/clusterkit/src/clustering/hdbscan_wrapper.rs +115 -0
  28. data/ext/clusterkit/src/clustering.rs +267 -0
  29. data/ext/clusterkit/src/embedder.rs +413 -0
  30. data/ext/clusterkit/src/lib.rs +22 -0
  31. data/ext/clusterkit/src/svd.rs +112 -0
  32. data/ext/clusterkit/src/tests.rs +16 -0
  33. data/ext/clusterkit/src/utils.rs +33 -0
  34. data/lib/clusterkit/clustering/hdbscan.rb +177 -0
  35. data/lib/clusterkit/clustering.rb +213 -0
  36. data/lib/clusterkit/clusterkit.rb +9 -0
  37. data/lib/clusterkit/configuration.rb +24 -0
  38. data/lib/clusterkit/dimensionality/pca.rb +251 -0
  39. data/lib/clusterkit/dimensionality/svd.rb +144 -0
  40. data/lib/clusterkit/dimensionality/umap.rb +311 -0
  41. data/lib/clusterkit/dimensionality.rb +29 -0
  42. data/lib/clusterkit/hdbscan_api_design.rb +142 -0
  43. data/lib/clusterkit/preprocessing.rb +106 -0
  44. data/lib/clusterkit/silence.rb +42 -0
  45. data/lib/clusterkit/utils.rb +51 -0
  46. data/lib/clusterkit/version.rb +5 -0
  47. data/lib/clusterkit.rb +93 -0
  48. data/lib/tasks/visualize.rake +641 -0
  49. metadata +194 -0
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Pure Ruby implementation of preprocessing functions
4
+
5
module ClusterKit
  # Data preprocessing utilities implemented in pure Ruby.
  module Preprocessing
    class << self
      # Normalize a 2D array using the selected strategy.
      #
      # @param data [Array<Array<Numeric>>] Input data, one inner array per row
      # @param method [Symbol] Normalization method (:standard, :minmax, :l2)
      # @return [Array<Array<Float>>] Normalized data
      # @raise [ArgumentError] if data is not an Array or the method is unknown
      def normalize(data, method: :standard)
        raise ArgumentError, "Unsupported data type: #{data.class}" unless data.is_a?(Array)

        case method
        when :standard then standard_normalize(data)
        when :minmax then minmax_normalize(data)
        when :l2 then l2_normalize(data)
        else
          raise ArgumentError, "Unknown normalization method: #{method}"
        end
      end

      # Reduce dimensionality using PCA before embedding.
      #
      # Not implemented in pure Ruby: this method always raises.
      #
      # @param data [Array] Input data (unused)
      # @param n_components [Integer] Number of PCA components (unused)
      # @raise [NotImplementedError] always
      def pca_reduce(data, n_components)
        # A pure Ruby version would need its own SVD implementation, so callers
        # are pointed at the SVD module instead.
        raise NotImplementedError, "PCA reduction requires the SVD module which needs to be called directly"
      end

      private

      # Column-wise standardization: (value - mean) / std, where std is the
      # population standard deviation (sum of squares divided by the row count).
      # Empty input is returned unchanged.
      def standard_normalize(data)
        return data if data.empty?

        n_rows = data.size
        n_cols = data.first.size

        # Accumulators start as Floats so Integer input never triggers
        # integer division below.
        means = Array.new(n_cols, 0.0)
        data.each do |row|
          row.each_with_index { |val, j| means[j] += val }
        end
        means.map! { |total| total / n_rows }

        stds = Array.new(n_cols, 0.0)
        data.each do |row|
          row.each_with_index { |val, j| stds[j] += (val - means[j])**2 }
        end
        stds.map! do |sum_sq|
          std = Math.sqrt(sum_sq / n_rows)
          std.zero? ? 1.0 : std # constant column: avoid division by zero
        end

        data.map do |row|
          row.each_with_index.map { |val, j| (val - means[j]) / stds[j] }
        end
      end

      # Column-wise min-max scaling: (value - min) / (max - min).
      # Constant columns map to 0.0. Empty input is returned unchanged.
      def minmax_normalize(data)
        return data if data.empty?

        n_cols = data.first.size
        mins = Array.new(n_cols) { Float::INFINITY }
        maxs = Array.new(n_cols) { -Float::INFINITY }

        data.each do |row|
          row.each_with_index do |val, j|
            mins[j] = val if val < mins[j]
            maxs[j] = val if val > maxs[j]
          end
        end

        # Ranges are coerced to Float: with Integer input, Integer / Integer
        # would otherwise truncate every interior value to 0.
        ranges = mins.zip(maxs).map do |lo, hi|
          span = (hi - lo).to_f
          span.zero? ? 1.0 : span # constant column: avoid division by zero
        end

        data.map do |row|
          row.each_with_index.map { |val, j| (val - mins[j]) / ranges[j] }
        end
      end

      # Row-wise L2 normalization: each row is divided by its Euclidean norm.
      # All-zero rows are divided by 1.0 instead.
      def l2_normalize(data)
        data.map do |row|
          norm = Math.sqrt(row.sum { |val| val**2 })
          norm = 1.0 if norm.zero? # avoid division by zero
          row.map { |val| val / norm }
        end
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
module ClusterKit
  # Helpers for suppressing output written to an IO stream, intended for
  # output produced by the native library.
  # Follows the silence_stream pattern known from Rails/ActiveSupport.
  module Silence
    # Temporarily redirect a single stream to the null device while the
    # block runs, then restore it.
    #
    # @example
    #   ClusterKit::Silence.silence_stream(STDOUT) do
    #     # code that produces unwanted output
    #   end
    #
    # @param stream [IO] stream to silence
    # @yield code to run while the stream is redirected
    # @return [Object] the block's return value
    def self.silence_stream(stream)
      # Take the backup before entering the protected region: if dup fails
      # (e.g. the stream is closed) there is nothing to restore, and the
      # original error must not be masked by a failing cleanup.
      old_stream = stream.dup
      begin
        stream.reopen(File::NULL)
        stream.sync = true
        yield
      ensure
        stream.reopen(old_stream)
        old_stream.close
      end
    end

    # Silence both STDOUT and STDERR for the duration of the block.
    #
    # @yield code to run with both streams redirected
    # @return [Object] the block's return value
    def self.silence_output
      silence_stream(STDOUT) do
        silence_stream(STDERR) do
          yield
        end
      end
    end

    # Run the block with output silenced unless the configuration's
    # +verbose+ flag is truthy.
    #
    # @yield code to run
    # @return [Object] the block's return value
    def self.maybe_silence
      if ClusterKit.configuration.verbose
        yield
      else
        silence_output { yield }
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Pure Ruby utility functions
4
+
5
module ClusterKit
  # Helpers for analysing the structure of a dataset.
  module Utils
    class << self
      # Estimate the intrinsic dimension of a dataset.
      #
      # Delegates to +estimate_intrinsic_dimension_rust+, which is not
      # defined in this file.
      #
      # @param data [Array] Input data; any other type is rejected
      # @param k_neighbors [Integer] Number of neighbors to consider
      # @return [Float] Estimated intrinsic dimension, as produced by the delegate
      # @raise [ArgumentError] if data is not an Array
      def estimate_intrinsic_dimension(data, k_neighbors: 10)
        unless data.is_a?(Array)
          raise ArgumentError, "Unsupported data type: #{data.class}"
        end

        estimate_intrinsic_dimension_rust(data, k_neighbors)
      end

      # Estimate hubness in a dataset.
      #
      # Delegates to +estimate_hubness_rust+ (not defined in this file) and
      # converts the keys of a Hash result to symbols.
      #
      # @param data [Array] Input data; any other type is rejected
      # @return [Hash] Hubness statistics with symbol keys
      # @raise [ArgumentError] if data is not an Array
      def estimate_hubness(data)
        unless data.is_a?(Array)
          raise ArgumentError, "Unsupported data type: #{data.class}"
        end

        symbolize_keys(estimate_hubness_rust(data))
      end

      # Measure neighborhood stability through an embedding.
      #
      # Not implemented yet: after validating both arguments this always
      # raises NotImplementedError.
      #
      # @param original_data [Array] Original high-dimensional data
      # @param embedded_data [Array] Embedded low-dimensional data
      # @param k [Integer] Number of neighbors to check (currently unused)
      # @raise [ArgumentError] if either dataset is not an Array
      # @raise [NotImplementedError] always, for valid input
      def neighborhood_stability(original_data, embedded_data, k: 15)
        # Validate in argument order so the first offending input is reported.
        [original_data, embedded_data].each do |input|
          next if input.is_a?(Array)

          raise ArgumentError, "Unsupported data type: #{input.class}"
        end

        # TODO: Implement neighborhood stability calculation
        raise NotImplementedError, "Neighborhood stability not implemented yet"
      end

      private

      # Return a copy of a Hash with every key converted to a Symbol;
      # non-Hash values are passed through untouched.
      def symbolize_keys(hash)
        hash.is_a?(Hash) ? hash.transform_keys { |name| name.to_s.to_sym } : hash
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module ClusterKit
  # Version string of the gem (first pre-release of 0.1.0).
  VERSION = '0.1.0.pre.1'
end
data/lib/clusterkit.rb ADDED
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "clusterkit/version"
4
+ require_relative "clusterkit/clusterkit"
5
+ require_relative "clusterkit/configuration"
6
+
7
+ # Main module for ClusterKit gem
8
+ # Provides high-performance dimensionality reduction algorithms
9
module ClusterKit
  # Base class for all ClusterKit errors.
  class Error < StandardError; end

  # Core error classes
  class DimensionError < Error; end
  class ConvergenceError < Error; end
  class InvalidParameterError < Error; end

  # Data-related errors
  class DataError < Error; end
  class IsolatedPointError < DataError; end
  class DisconnectedGraphError < DataError; end
  class InsufficientDataError < DataError; end

  # Load modules eagerly - can't use autoload with require_relative path issues.
  # The native extension ("clusterkit/clusterkit") is already required at the
  # top of this file, so it is not required again here.
  require_relative "clusterkit/dimensionality"
  require_relative "clusterkit/clustering"

  # Autoload utilities (resolved through the load path on first reference)
  autoload :Utils, "clusterkit/utils"
  autoload :Preprocessing, "clusterkit/preprocessing"
  autoload :Silence, "clusterkit/silence"

  class << self
    # Quick UMAP embedding: builds a Dimensionality::UMAP and calls
    # fit_transform on it.
    #
    # @param data [Array] Input data
    # @param n_components [Integer] Number of dimensions in output
    # @param options [Hash] Extra keyword arguments forwarded to Dimensionality::UMAP.new
    # @return [Array] Embedded data
    def umap(data, n_components: 2, **options)
      Dimensionality::UMAP.new(n_components: n_components, **options).fit_transform(data)
    end

    # Quick PCA: builds a Dimensionality::PCA and calls fit_transform on it.
    #
    # @param data [Array] Input data
    # @param n_components [Integer] Number of dimensions in output
    # @return [Array] Transformed data
    def pca(data, n_components: 2)
      Dimensionality::PCA.new(n_components: n_components).fit_transform(data)
    end

    # t-SNE is not yet implemented; this method always raises.
    #
    # @deprecated Not implemented - use UMAP instead
    # @raise [NotImplementedError] always
    def tsne(data, n_components: 2, **options)
      raise NotImplementedError, "t-SNE is not yet implemented. Please use UMAP instead, which provides similar dimensionality reduction capabilities."
    end

    # Estimate intrinsic dimension of data via Utils.estimate_intrinsic_dimension.
    #
    # @param data [Array] Input data
    # @param k [Integer] Number of neighbors to consider
    # @return [Float] Estimated intrinsic dimension
    def estimate_dimension(data, k: 10)
      Utils.estimate_intrinsic_dimension(data, k_neighbors: k)
    end

    # Perform SVD: builds a Dimensionality::SVD and calls fit_transform on it.
    #
    # @param matrix [Array] Input matrix
    # @param k [Integer] Number of components
    # @param n_iter [Integer] Number of iterations for randomized algorithm
    # @return [Array] U, S, V matrices (as returned by SVD#fit_transform)
    def svd(matrix, k, n_iter: 2)
      Dimensionality::SVD.new(n_components: k, n_iter: n_iter).fit_transform(matrix)
    end

    # Quick K-means with automatic k detection.
    #
    # When k is nil (or false) it is chosen with Clustering::KMeans.optimal_k
    # over k_range before fitting.
    #
    # @param data [Array] Input data
    # @param k [Integer, nil] Number of clusters (auto-detect if nil)
    # @param k_range [Range] Range for auto-detection
    # @param options [Hash] Extra keyword arguments forwarded to Clustering::KMeans.new
    # @return [Array] Cluster labels
    def kmeans(data, k: nil, k_range: 2..10, **options)
      k ||= Clustering::KMeans.optimal_k(data, k_range: k_range)
      Clustering::KMeans.new(k: k, **options).fit_predict(data)
    end
  end
end