picky 0.0.0 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/picky +14 -0
- data/lib/bundling.rb +10 -0
- data/lib/constants.rb +9 -0
- data/lib/deployment.rb +212 -0
- data/lib/picky/application.rb +40 -0
- data/lib/picky/cacher/convenience.rb +3 -0
- data/lib/picky/cacher/generator.rb +17 -0
- data/lib/picky/cacher/partial/default.rb +7 -0
- data/lib/picky/cacher/partial/none.rb +19 -0
- data/lib/picky/cacher/partial/strategy.rb +7 -0
- data/lib/picky/cacher/partial/subtoken.rb +91 -0
- data/lib/picky/cacher/partial_generator.rb +15 -0
- data/lib/picky/cacher/similarity/default.rb +7 -0
- data/lib/picky/cacher/similarity/double_levenshtone.rb +73 -0
- data/lib/picky/cacher/similarity/none.rb +25 -0
- data/lib/picky/cacher/similarity/strategy.rb +7 -0
- data/lib/picky/cacher/similarity_generator.rb +15 -0
- data/lib/picky/cacher/weights/default.rb +7 -0
- data/lib/picky/cacher/weights/logarithmic.rb +39 -0
- data/lib/picky/cacher/weights/strategy.rb +7 -0
- data/lib/picky/cacher/weights_generator.rb +15 -0
- data/lib/picky/configuration/configuration.rb +13 -0
- data/lib/picky/configuration/field.rb +68 -0
- data/lib/picky/configuration/indexes.rb +60 -0
- data/lib/picky/configuration/queries.rb +32 -0
- data/lib/picky/configuration/type.rb +52 -0
- data/lib/picky/cores.rb +101 -0
- data/lib/picky/db/configuration.rb +23 -0
- data/lib/picky/ext/ruby19/extconf.rb +7 -0
- data/lib/picky/ext/ruby19/performant.c +339 -0
- data/lib/picky/extensions/array.rb +45 -0
- data/lib/picky/extensions/hash.rb +11 -0
- data/lib/picky/extensions/module.rb +15 -0
- data/lib/picky/extensions/symbol.rb +18 -0
- data/lib/picky/generator.rb +156 -0
- data/lib/picky/helpers/cache.rb +23 -0
- data/lib/picky/helpers/gc.rb +11 -0
- data/lib/picky/helpers/measuring.rb +45 -0
- data/lib/picky/helpers/search.rb +27 -0
- data/lib/picky/index/bundle.rb +328 -0
- data/lib/picky/index/category.rb +109 -0
- data/lib/picky/index/combined.rb +38 -0
- data/lib/picky/index/type.rb +30 -0
- data/lib/picky/indexers/base.rb +77 -0
- data/lib/picky/indexers/default.rb +3 -0
- data/lib/picky/indexers/field.rb +13 -0
- data/lib/picky/indexers/no_source_specified_error.rb +5 -0
- data/lib/picky/indexers/solr.rb +60 -0
- data/lib/picky/indexes.rb +180 -0
- data/lib/picky/initializers/ext.rb +6 -0
- data/lib/picky/initializers/mysql.rb +22 -0
- data/lib/picky/loader.rb +287 -0
- data/lib/picky/loggers/search.rb +19 -0
- data/lib/picky/performant/array.rb +23 -0
- data/lib/picky/query/allocation.rb +82 -0
- data/lib/picky/query/allocations.rb +131 -0
- data/lib/picky/query/base.rb +124 -0
- data/lib/picky/query/combination.rb +69 -0
- data/lib/picky/query/combinations.rb +106 -0
- data/lib/picky/query/combinator.rb +92 -0
- data/lib/picky/query/full.rb +15 -0
- data/lib/picky/query/live.rb +22 -0
- data/lib/picky/query/qualifiers.rb +73 -0
- data/lib/picky/query/solr.rb +77 -0
- data/lib/picky/query/token.rb +215 -0
- data/lib/picky/query/tokens.rb +102 -0
- data/lib/picky/query/weigher.rb +159 -0
- data/lib/picky/query/weights.rb +55 -0
- data/lib/picky/rack/harakiri.rb +37 -0
- data/lib/picky/results/base.rb +103 -0
- data/lib/picky/results/full.rb +19 -0
- data/lib/picky/results/live.rb +19 -0
- data/lib/picky/routing.rb +165 -0
- data/lib/picky/signals.rb +11 -0
- data/lib/picky/solr/schema_generator.rb +73 -0
- data/lib/picky/sources/base.rb +19 -0
- data/lib/picky/sources/csv.rb +30 -0
- data/lib/picky/sources/db.rb +77 -0
- data/lib/picky/tokenizers/base.rb +130 -0
- data/lib/picky/tokenizers/default.rb +3 -0
- data/lib/picky/tokenizers/index.rb +73 -0
- data/lib/picky/tokenizers/query.rb +70 -0
- data/lib/picky/umlaut_substituter.rb +21 -0
- data/lib/picky-tasks.rb +6 -0
- data/lib/picky.rb +18 -0
- data/lib/tasks/application.rake +5 -0
- data/lib/tasks/cache.rake +53 -0
- data/lib/tasks/framework.rake +4 -0
- data/lib/tasks/index.rake +29 -0
- data/lib/tasks/server.rake +48 -0
- data/lib/tasks/shortcuts.rake +13 -0
- data/lib/tasks/solr.rake +36 -0
- data/lib/tasks/spec.rake +11 -0
- data/lib/tasks/statistics.rake +13 -0
- data/lib/tasks/try.rake +29 -0
- data/prototype_project/Gemfile +23 -0
- data/prototype_project/Rakefile +1 -0
- data/prototype_project/app/README +6 -0
- data/prototype_project/app/application.rb +50 -0
- data/prototype_project/app/application.ru +29 -0
- data/prototype_project/app/db.yml +10 -0
- data/prototype_project/app/logging.rb +20 -0
- data/prototype_project/app/unicorn.ru +10 -0
- data/prototype_project/log/README +1 -0
- data/prototype_project/script/console +34 -0
- data/prototype_project/tmp/README +0 -0
- data/prototype_project/tmp/pids/README +0 -0
- data/spec/ext/performant_spec.rb +64 -0
- data/spec/lib/application_spec.rb +61 -0
- data/spec/lib/cacher/partial/subtoken_spec.rb +89 -0
- data/spec/lib/cacher/partial_generator_spec.rb +35 -0
- data/spec/lib/cacher/similarity/double_levenshtone_spec.rb +60 -0
- data/spec/lib/cacher/similarity/none_spec.rb +23 -0
- data/spec/lib/cacher/similarity_generator_spec.rb +22 -0
- data/spec/lib/cacher/weights/logarithmic_spec.rb +30 -0
- data/spec/lib/cacher/weights_generator_spec.rb +21 -0
- data/spec/lib/configuration/configuration_spec.rb +38 -0
- data/spec/lib/configuration/type_spec.rb +49 -0
- data/spec/lib/configuration_spec.rb +8 -0
- data/spec/lib/cores_spec.rb +65 -0
- data/spec/lib/extensions/array_spec.rb +37 -0
- data/spec/lib/extensions/hash_spec.rb +11 -0
- data/spec/lib/extensions/module_spec.rb +27 -0
- data/spec/lib/extensions/symbol_spec.rb +85 -0
- data/spec/lib/generator_spec.rb +135 -0
- data/spec/lib/helpers/cache_spec.rb +35 -0
- data/spec/lib/helpers/gc_spec.rb +71 -0
- data/spec/lib/helpers/measuring_spec.rb +18 -0
- data/spec/lib/helpers/search_spec.rb +50 -0
- data/spec/lib/index/bundle_partial_generation_speed_spec.rb +47 -0
- data/spec/lib/index/bundle_spec.rb +260 -0
- data/spec/lib/index/category_spec.rb +203 -0
- data/spec/lib/indexers/base_spec.rb +73 -0
- data/spec/lib/indexers/field_spec.rb +20 -0
- data/spec/lib/loader_spec.rb +48 -0
- data/spec/lib/loggers/search_spec.rb +19 -0
- data/spec/lib/performant/array_spec.rb +13 -0
- data/spec/lib/query/allocation_spec.rb +194 -0
- data/spec/lib/query/allocations_spec.rb +336 -0
- data/spec/lib/query/base_spec.rb +104 -0
- data/spec/lib/query/combination_spec.rb +90 -0
- data/spec/lib/query/combinations_spec.rb +83 -0
- data/spec/lib/query/combinator_spec.rb +112 -0
- data/spec/lib/query/full_spec.rb +22 -0
- data/spec/lib/query/live_spec.rb +61 -0
- data/spec/lib/query/qualifiers_spec.rb +31 -0
- data/spec/lib/query/solr_spec.rb +51 -0
- data/spec/lib/query/token_spec.rb +297 -0
- data/spec/lib/query/tokens_spec.rb +189 -0
- data/spec/lib/query/weights_spec.rb +47 -0
- data/spec/lib/results/base_spec.rb +233 -0
- data/spec/lib/routing_spec.rb +318 -0
- data/spec/lib/solr/schema_generator_spec.rb +42 -0
- data/spec/lib/sources/db_spec.rb +91 -0
- data/spec/lib/tokenizers/base_spec.rb +61 -0
- data/spec/lib/tokenizers/index_spec.rb +51 -0
- data/spec/lib/tokenizers/query_spec.rb +105 -0
- data/spec/lib/umlaut_substituter_spec.rb +84 -0
- data/spec/specific/speed_spec.rb +55 -0
- metadata +371 -15
- data/README.textile +0 -9
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
module Configuration

  # Configuration of a single field of a type.
  #
  # Holds the field name, the name it is indexed under, the indexer and
  # tokenizer classes to use, and the remaining options, which are handed
  # to Index::Category when the field is generated.
  #
  class Field
    attr_reader :name, :indexed_name, :virtual
    attr_accessor :type # convenience

    # name    - the field's name.
    # options - recognised keys are :indexer, :tokenizer, :indexed_field,
    #           :virtual and :qualifiers. Everything left after the first
    #           four are removed is kept and passed on by #generate.
    #
    def initialize name, options = {}
      @name = name

      # Work on a copy: the deletes below would otherwise strip keys from
      # the hash the caller handed in.
      #
      options = options.dup

      # TODO add source as option

      @indexer_class   = options.delete(:indexer)   || Indexers::Default
      @tokenizer_class = options.delete(:tokenizer) || Tokenizers::Index # Default

      @indexed_name = options.delete(:indexed_field) || name # TODO Rename to indexed_as?
      @virtual      = options.delete(:virtual)       || false

      Query::Qualifiers.add(name, options[:qualifiers]) if options[:qualifiers]

      # @remove = options[:remove] || false
      # @filter = options[:filter] || true

      @options = options
    end

    # The field's own source if one is set, else the source of its type.
    #
    def source
      @source || type.source
    end

    # Builds the Index::Category for this field.
    #
    def generate
      Index::Category.new self.name, type, @options
    end

    # TODO Duplicate code in bundle. Move to application.
    #
    # TODO Move to type, and use in bundle from there.
    #
    def search_index_root
      File.join SEARCH_ROOT, 'index'
    end

    # Directory holding the index files of this field's type for the
    # current environment.
    #
    # TODO Move to config. Duplicate Code in field.rb.
    #
    def cache_directory
      File.join search_index_root, SEARCH_ENVIRONMENT, type.name.to_s
    end

    # Full path of the text index file for this field.
    #
    def search_index_file_name
      File.join cache_directory, "#{type.name}_#{name}_index.txt"
    end

    # Makes sure the cache directory exists, then runs the indexer.
    #
    def index
      prepare_cache_directory
      indexer.index
    end

    def prepare_cache_directory
      FileUtils.mkdir_p cache_directory
    end

    # Makes sure the cache directory exists, then generates the caches
    # of the category built by #generate.
    #
    def cache
      prepare_cache_directory
      generate.generate_caches
    end

    # Lazily created indexer instance.
    #
    def indexer
      @indexer ||= @indexer_class.new(type, self)
    end

    # Lazily created tokenizer instance.
    #
    def tokenizer
      @tokenizer ||= @tokenizer_class.new # TODO Make instances.
    end

    def virtual?
      !!virtual
    end
  end

end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
module Configuration

  # Index-side configuration: collects the configured types and forwards
  # the tokenizer settings to the index tokenizer class.
  #
  class Indexes

    attr_reader :types

    def initialize *types
      @types = types
    end

    # The tokenizer class the delegated settings below are sent to.
    #
    def default_index
      Tokenizers::Index
    end

    # Delegates
    #
    delegate :illegal_characters, :contract_expressions, :stopwords, :split_text_on, :normalize_words, :illegal_characters_after, :to => :default_index

    # Adds a new type built from the given name and fields.
    #
    def type name, *fields
      types << Type.new(name, *fields)
    end

    # Creates a field configuration.
    #
    def field name, options = {}
      Field.new name, options
    end

    # Each of the following runs the named action on the types whose
    # names are given, or on all types if no names are given.
    #
    def take_snapshot *type_names
      only_if_included_in(type_names) { |type| type.take_snapshot }
    end
    def index *type_names
      only_if_included_in(type_names) { |type| type.index }
    end
    def index_solr *type_names
      only_if_included_in(type_names) { |type| type.index_solr }
    end

    # Yields every type whose name is in type_names.
    # An empty list selects all types.
    #
    def only_if_included_in type_names = []
      wanted = type_names.empty? ? types.map(&:name) : type_names
      types.each do |type|
        yield type if wanted.include?(type.name)
      end
    end

  end

end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
module Configuration

  # Query-side configuration: wraps the routing and forwards tokenizer
  # settings to the query tokenizer class.
  #
  class Queries

    attr_reader :routing

    # routing - the object the route definitions below are delegated to.
    #
    def initialize routing
      @routing = routing
    end

    # The tokenizer class the delegated settings below are sent to.
    #
    def default_index
      Tokenizers::Query
    end

    # Routes.
    #
    delegate :defaults, :route, :live, :full, :root, :default, :to => :routing

    # Sets the maximum on Query::Tokens.
    #
    def maximum_tokens amount
      Query::Tokens.maximum = amount
    end

    # Tokenizer settings.
    #
    delegate :illegal_characters, :contract_expressions, :stopwords, :split_text_on, :normalize_words, :illegal_characters_after, :to => :default_index

  end

end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module Configuration

  # Configuration of one index type: its name, its source, its fields and
  # a handful of options.
  #
  class Type
    attr_reader :name,
                :source,
                :fields,
                :after_indexing,
                :result_type,
                :ignore_unassigned_tokens,
                :solr

    # name    - the type's name.
    # source  - the source the type takes its data from.
    # fields  - field configurations; each is duplicated and bound to
    #           this type.
    # options - last argument. If it is a Field it is treated as one more
    #           field and the options are empty. Keys read: :after_indexing,
    #           :result_type, :ignore_unassigned_tokens, :solr.
    #
    def initialize name, source, *fields, options
      if Configuration::Field === options
        fields << options
        options = {}
      end

      @name   = name
      @source = source

      # dup, if field is reused. TODO Rewrite.
      @fields = fields.map do |field|
        copy = field.dup
        copy.type = self
        copy
      end

      @after_indexing           = options[:after_indexing]
      @result_type              = options[:result_type] || name
      @ignore_unassigned_tokens = options[:ignore_unassigned_tokens] || false # TODO Move to query?
      @solr                     = options[:solr] || nil
    end

    # Builds the Index::Type from the categories generated by the fields.
    #
    def generate
      categories = fields.map(&:generate)
      Index::Type.new name, result_type, ignore_unassigned_tokens, *categories
    end

    def table_name
      self # FIXME UGH, Remove anyway
    end

    # Asks the source to take a snapshot for this type.
    #
    def take_snapshot
      source.take_snapshot self
    end

    # Indexes every field in turn.
    #
    def index
      fields.each(&:index)
    end

    # The non-virtual fields if solr is configured, else none.
    #
    def solr_fields
      return [] unless solr
      fields.reject { |field| field.virtual? }
    end

    # Runs the Solr indexer for this type; does nothing without solr.
    #
    # TODO Delegate to Solr handler.
    #
    def index_solr
      if solr
        @indexer = Indexers::Solr.new self
        @indexer.index
      end
    end
  end
end
|
data/lib/picky/cores.rb
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Infinity = 1.0/0

# Handles processing over multiple cores.
#
class Cores

  # Pass it an ary or generator.
  #
  #   generator = (1..10).each
  #   forked generator, :max => 5 do |element|
  #
  #   end
  #
  # Each element is handed to the block in a forked child process.
  #
  # Options include:
  #  * max:      Maximum # of processors to use. Default is all it can get.
  #  * amount:   Exact number of processes to use (overrides max).
  #  * randomly: Shuffle the elements before processing.
  #
  # Note: A nil or false element ends the spawning of further children.
  #
  def self.forked ary_or_generator, options = {}
    ary_or_generator = ary_or_generator.sort_by { rand } if options[:randomly]
    generator        = ary_or_generator.each

    # Get the maximum number of processors.
    #
    max = max_processors options
    currently_processing = 0

    # Runs until there is no child left to wait for.
    #
    while generator

      # Ramp it up to num processors.
      #
      while currently_processing < max

        currently_processing += 1

        element = nil
        begin
          element = generator.next
        rescue StopIteration
          break
        end
        break unless element

        Process.fork do
          yield element
        end

      end

      begin
        Process.wait 0 # Block and wait for any child to finish.
      rescue Errno::ECHILD
        break
      ensure
        currently_processing -= 1
      end
    end
  end

  # Return the number of maximum usable processors.
  #
  # :amount wins if given, else the smaller of the detected number of
  # cores and :max.
  #
  def self.max_processors options = {}
    options[:amount] || [number_of_cores, (options[:max] || Infinity)].min
  end

  # Gets the number of cores depending on OS.
  #
  def self.number_of_cores
    extract_cores_for actual_platform
  end

  # Extracts the platform os from the platform.
  #
  # Note: Could also use 'rbconfig'.
  #
  def self.actual_platform
    matched = platform.match(/-\b([a-z]*)/)
    matched && matched[1]
  end

  # Returns a mapping
  #   os_name => lambda_which_returns_a_number_of_cores
  #
  # The darwin probe matches the label case-insensitively and keeps only
  # the first number on the line, so a differently capitalised label or
  # trailing figures after the count do not distort the result.
  #
  @@number_of_cores = {
    'darwin' => lambda { `system_profiler SPHardwareDataType | grep -i 'Total Number Of Cores'`[/\d+/].to_s },
    'linux'  => lambda { `grep -ci ^processor /proc/cpuinfo` }
  }
  def self.os_to_core_mapping
    @@number_of_cores
  end

  # Extracts the number of cores for the given os name.
  #
  # Note: Default is 1. This also applies if the probe yields nothing
  #       usable: "".to_i is 0, and 0 is truthy in Ruby, so a plain
  #       "|| 1" would let a zero through and no child would be forked.
  #
  def self.extract_cores_for os
    code_to_execute = os_to_core_mapping[os]
    return 1 unless code_to_execute

    cores = code_to_execute.call.to_i
    cores > 0 ? cores : 1
  end

  def self.platform
    RUBY_PLATFORM
  end

end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Abstract ActiveRecord base class that carries the connection settings.
#
class DB < ActiveRecord::Base

  self.abstract_class = true

  # Remembers the connection options.
  #
  # With a :file option, the options are loaded as YAML from that file
  # (relative to SEARCH_ROOT); otherwise the given hash is used as is.
  #
  # Returns the class itself.
  #
  def self.configure options = {}
    filename = options[:file]
    @connection_options =
      if filename
        File.open(File.join(SEARCH_ROOT, filename)) { |f| YAML::load(f) }
      else
        options
      end
    self
  end

  # Establishes the connection with the remembered options.
  # Does nothing in the test environment.
  #
  def self.connect
    establish_connection @connection_options unless SEARCH_ENVIRONMENT.to_s == 'test'
  end

end
|
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
// Note: This is the Ruby 1.9 version.
|
|
2
|
+
//
|
|
3
|
+
#include "ruby.h"
|
|
4
|
+
|
|
5
|
+
// Copying internal ruby methods.
|
|
6
|
+
//
|
|
7
|
+
// Returns the element of ary at offset, or Qnil if offset is negative or
// not below the array's length (which includes every offset into an
// empty array).
//
// Written with a prototype instead of an old-style parameter list so the
// compiler can check the argument types at each call.
//
static inline VALUE rb_ary_elt(VALUE ary, long offset)
{
  if (offset < 0 || RARRAY_LEN(ary) <= offset) {
    return Qnil;
  }
  return RARRAY_PTR(ary)[offset];
}
|
|
17
|
+
VALUE rb_ary_make_hash(VALUE, VALUE);
|
|
18
|
+
// Builds a new Hash whose keys are the elements of ary1 and, if given,
// of ary2; every value is Qtrue.
//
// ary1 - a Ruby Array.
// ary2 - a Ruby Array, or 0 / Qnil for "no second array".
//
// The length is re-read on every iteration because rb_hash_aset runs
// while the loop is in progress.
//
static VALUE ary_make_hash(VALUE ary1, VALUE ary2)
{
  VALUE hash = rb_hash_new();
  long i;

  for (i = 0; i < RARRAY_LEN(ary1); i++) {
    rb_hash_aset(hash, RARRAY_PTR(ary1)[i], Qtrue);
  }
  // RTEST treats both 0 (Qfalse) and Qnil as absent, so Qnil is never
  // dereferenced as an array.
  if (RTEST(ary2)) {
    for (i = 0; i < RARRAY_LEN(ary2); i++) {
      rb_hash_aset(hash, RARRAY_PTR(ary2)[i], Qtrue);
    }
  }
  return hash;
}
|
|
34
|
+
|
|
35
|
+
// Comparison functions.
|
|
36
|
+
//
|
|
37
|
+
// Three-way comparison of two Fixnum VALUEs.
//
// Returns a negative value, zero or a positive value if a is less than,
// equal to or greater than b.
//
// The values are compared instead of subtracted, so the result cannot
// overflow. static inline: a plain inline definition provides no
// external definition in C99 and later, which leaves the symbol
// undefined whenever the call is not inlined.
//
static inline int intvaluecmp(VALUE a, VALUE b) {
  const long x = FIX2LONG(a);
  const long y = FIX2LONG(b);

  return (x > y) - (x < y);
}
|
|
40
|
+
// Three-way comparison of the ints a and b point to.
//
// Returns a negative value, zero or a positive value if *a is less than,
// equal to or greater than *b.
//
// The values are compared instead of subtracted: *a - *b overflows for
// operands far apart (e.g. INT_MIN and 1), which is undefined behaviour.
// static inline: a plain inline definition provides no external
// definition in C99 and later.
//
static inline int intcmp(const int * a, const int * b) {
  const int x = *a;
  const int y = *b;

  return (x > y) - (x < y);
}
|
|
43
|
+
// Three-way comparison of the longs a and b point to.
//
// Returns a negative value, zero or a positive value if the first is
// less than, equal to or greater than the second.
//
// The values are compared instead of subtracted, so the result cannot
// overflow. static inline: a plain inline definition provides no
// external definition in C99 and later.
//
static inline long longcmp(const void * a, const void * b) {
  const long x = *(const long *) a;
  const long y = *(const long *) b;

  return (x > y) - (x < y);
}
|
|
46
|
+
|
|
47
|
+
// This version just calls the & consecutively for all arrays.
|
|
48
|
+
//
|
|
49
|
+
inline VALUE memory_efficient_intersect(VALUE self, VALUE length_sorted_array_of_arrays) {
|
|
50
|
+
// counters
|
|
51
|
+
long i, j;
|
|
52
|
+
|
|
53
|
+
// structs
|
|
54
|
+
struct RArray *rb_array_of_arrays;
|
|
55
|
+
struct RArray *smallest_array;
|
|
56
|
+
struct RArray *current_array;
|
|
57
|
+
VALUE hash;
|
|
58
|
+
|
|
59
|
+
// temps
|
|
60
|
+
VALUE v, vv;
|
|
61
|
+
|
|
62
|
+
// conversions
|
|
63
|
+
rb_array_of_arrays = RARRAY(length_sorted_array_of_arrays);
|
|
64
|
+
smallest_array = RARRAY(rb_ary_dup(RARRAY_PTR(rb_array_of_arrays)[0]));
|
|
65
|
+
|
|
66
|
+
// iterate through all arrays
|
|
67
|
+
for (i = 1; i < RARRAY_LEN(rb_array_of_arrays); i++) {
|
|
68
|
+
// Break if the smallest array is empty
|
|
69
|
+
if (RARRAY_LEN(smallest_array) == 0) {
|
|
70
|
+
break;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// make a hash from the currently smallest version
|
|
74
|
+
hash = ary_make_hash(smallest_array, 0);
|
|
75
|
+
// clear for use as temp array
|
|
76
|
+
rb_ary_clear(smallest_array);
|
|
77
|
+
|
|
78
|
+
current_array = RARRAY_PTR(rb_array_of_arrays)[i];
|
|
79
|
+
// iterate through all array elements
|
|
80
|
+
for (j = 0; j < RARRAY_LEN(current_array); j++) {
|
|
81
|
+
v = vv = rb_ary_elt(current_array, j);
|
|
82
|
+
if (st_delete(RHASH_TBL(hash), (unsigned long*)&vv, 0)) {
|
|
83
|
+
rb_ary_push(smallest_array, v);
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
return smallest_array;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Brute force algorithm to find the intersection of an array of length sorted, unsorted arrays.
|
|
92
|
+
// This algorithm can be faster than others for small arrays.
|
|
93
|
+
//
|
|
94
|
+
// inline VALUE brute_force_intersect(VALUE self, VALUE length_sorted_array_of_arrays) {
|
|
95
|
+
// // counters
|
|
96
|
+
// long i, j, k;
|
|
97
|
+
//
|
|
98
|
+
// // structs
|
|
99
|
+
// struct RArray *rb_array_of_arrays;
|
|
100
|
+
// struct RArray *candidate_answer_set;
|
|
101
|
+
// struct RArray *current_set;
|
|
102
|
+
//
|
|
103
|
+
// // conversions
|
|
104
|
+
// rb_array_of_arrays = RARRAY(length_sorted_array_of_arrays);
|
|
105
|
+
//
|
|
106
|
+
// // temps
|
|
107
|
+
// VALUE e;
|
|
108
|
+
// unsigned char found;
|
|
109
|
+
//
|
|
110
|
+
// // Let the smallest set s[0] be the candidate answer set
|
|
111
|
+
// // Note: Need a duplicate
|
|
112
|
+
// candidate_answer_set = RARRAY(rb_ary_dup(rb_array_of_arrays->ptr[0]));
|
|
113
|
+
//
|
|
114
|
+
//   // For each entry in candidate answer set
|
|
115
|
+
// // Get current value
|
|
116
|
+
// for(i = 0; i < candidate_answer_set->len; i++) {
|
|
117
|
+
// e = candidate_answer_set->ptr[i];
|
|
118
|
+
//
|
|
119
|
+
// // Find the current value in other arrays
|
|
120
|
+
// // if not found, break
|
|
121
|
+
// for(j = 1; j < rb_array_of_arrays->len; j++) {
|
|
122
|
+
// current_set = RARRAY(rb_array_of_arrays->ptr[j]);
|
|
123
|
+
// found = 0;
|
|
124
|
+
//
|
|
125
|
+
// // Find with a linear search
|
|
126
|
+
// for(k = 0; k < current_set->len; k++) {
|
|
127
|
+
// if (e == current_set->ptr[k]) {
|
|
128
|
+
// found = 1;
|
|
129
|
+
// break;
|
|
130
|
+
// }
|
|
131
|
+
// }
|
|
132
|
+
//
|
|
133
|
+
// // break if not found
|
|
134
|
+
// if (!found) {
|
|
135
|
+
// break;
|
|
136
|
+
// }
|
|
137
|
+
// }
|
|
138
|
+
//
|
|
139
|
+
// // remove from candidate answer set if not found
|
|
140
|
+
// if (!found) {
|
|
141
|
+
// candidate_answer_set->ptr[i] = Qnil;
|
|
142
|
+
// }
|
|
143
|
+
// }
|
|
144
|
+
//
|
|
145
|
+
// // compact the candidate answer set
|
|
146
|
+
// // rb_ary_compact_bang(candidate_answer_set);
|
|
147
|
+
// rb_funcall(candidate_answer_set, rb_intern("compact!"), 0);
|
|
148
|
+
//
|
|
149
|
+
// return candidate_answer_set;
|
|
150
|
+
// }
|
|
151
|
+
|
|
152
|
+
// inline VALUE intersect_unique(VALUE self, VALUE length_sorted_array_of_arrays) {
|
|
153
|
+
// // VALUE length_sorted_array_of_arrays = (_length_sorted_array_of_arrays);
|
|
154
|
+
//
|
|
155
|
+
// // structs
|
|
156
|
+
// struct RArray *result;
|
|
157
|
+
// struct RArray *rb_array_of_arrays;
|
|
158
|
+
//
|
|
159
|
+
// // conversions
|
|
160
|
+
// rb_array_of_arrays = RARRAY(length_sorted_array_of_arrays);
|
|
161
|
+
//
|
|
162
|
+
// // TODO
|
|
163
|
+
//
|
|
164
|
+
// return result;
|
|
165
|
+
// }
|
|
166
|
+
|
|
167
|
+
// Generates the intersection of multiple
|
|
168
|
+
//
|
|
169
|
+
// inline VALUE sorting_intersect_multiple(VALUE self, VALUE length_sorted_array_of_arrays) {
|
|
170
|
+
// // TODO
|
|
171
|
+
// }
|
|
172
|
+
|
|
173
|
+
// Generates the intersection of multiple length sorted, sorted arrays
|
|
174
|
+
//
|
|
175
|
+
// inline VALUE intersect_multiple_sorted(VALUE self, VALUE _length_sorted_array_of_arrays) {
|
|
176
|
+
// VALUE length_sorted_array_of_arrays = (_length_sorted_array_of_arrays);
|
|
177
|
+
//
|
|
178
|
+
// // counters
|
|
179
|
+
// long i, j;
|
|
180
|
+
// long current_set_position, current_answer_set_position;
|
|
181
|
+
//
|
|
182
|
+
// // structs
|
|
183
|
+
// struct RArray *rb_array_of_arrays;
|
|
184
|
+
// struct RArray *candidate_answer_set;
|
|
185
|
+
// struct RArray *current_set;
|
|
186
|
+
//
|
|
187
|
+
// // temps
|
|
188
|
+
// long e;
|
|
189
|
+
//
|
|
190
|
+
// // conversions
|
|
191
|
+
// rb_array_of_arrays = RARRAY(length_sorted_array_of_arrays);
|
|
192
|
+
//
|
|
193
|
+
// // Let the smallest set s[0] be the candidate answer set
|
|
194
|
+
// // Note: Need a duplicate
|
|
195
|
+
// candidate_answer_set = RARRAY(rb_ary_dup(rb_array_of_arrays->ptr[0]));
|
|
196
|
+
//
|
|
197
|
+
// // For each set s[i], i = 1 .. k do
|
|
198
|
+
// for(i = 1; i < rb_array_of_arrays->len; i++) {
|
|
199
|
+
// current_set = RARRAY(rb_array_of_arrays->ptr[i]);
|
|
200
|
+
// current_set_position = 0;
|
|
201
|
+
//
|
|
202
|
+
// // for each element e in the candidate answer set
|
|
203
|
+
// for(j = 0; j < candidate_answer_set->len; j++) {
|
|
204
|
+
// e = candidate_answer_set->ptr[j];
|
|
205
|
+
//
|
|
206
|
+
// // search for e in the range l[i] to size(s[i])
|
|
207
|
+
// // and update l[i] to the last position probed in the previous step
|
|
208
|
+
// // if e was not found then
|
|
209
|
+
// if (bsearch(
|
|
210
|
+
// &e,
|
|
211
|
+
//         &current_set->ptr[current_set_position],
|
|
212
|
+
// (current_set->len - current_set_position),
|
|
213
|
+
// sizeof(VALUE), //sizeof(current_set->ptr[0]),
|
|
214
|
+
// intcmp //longcmp
|
|
215
|
+
// ) == NULL) {
|
|
216
|
+
//
|
|
217
|
+
// // remove e from the candidate answer set
|
|
218
|
+
// // and advance e to the next element in the answer set
|
|
219
|
+
// // rb_ary_delete_at(candidate_answer_set, j);
|
|
220
|
+
// candidate_answer_set->ptr[j] = Qnil;
|
|
221
|
+
// }
|
|
222
|
+
// current_set_position = j - 1;
|
|
223
|
+
// }
|
|
224
|
+
//
|
|
225
|
+
// // compact the candidate answer set
|
|
226
|
+
// // rb_ary_compact_bang(candidate_answer_set);
|
|
227
|
+
// rb_funcall(candidate_answer_set, rb_intern("compact!"), 0);
|
|
228
|
+
// }
|
|
229
|
+
//
|
|
230
|
+
// return candidate_answer_set;
|
|
231
|
+
// }
|
|
232
|
+
|
|
233
|
+
// Trying to make a custom version of Matz' ary &
|
|
234
|
+
//
|
|
235
|
+
// Differences:
|
|
236
|
+
// * Multiple arrays
|
|
237
|
+
// * No to_ary
|
|
238
|
+
// * Smallest array is used to make hash
|
|
239
|
+
// Note: Assumes that whatever is given in as array of arrays is sorted by array sizes.
|
|
240
|
+
//
|
|
241
|
+
// static VALUE rb_ary_and(ary1, ary2) VALUE ary1, ary2; {
|
|
242
|
+
// static VALUE intersect_multiple_with_hash(VALUE self, VALUE _length_sorted_array_of_arrays) {
|
|
243
|
+
// // VALUE hash, ary3, v, vv;
|
|
244
|
+
// // long i;
|
|
245
|
+
// //
|
|
246
|
+
// // ary2 = to_ary(ary2);
|
|
247
|
+
// // ary3 = rb_ary_new2(RARRAY(ary1)->len < RARRAY(ary2)->len ?
|
|
248
|
+
// // RARRAY(ary1)->len : RARRAY(ary2)->len);
|
|
249
|
+
// // hash = ary_make_hash(ary2, 0);
|
|
250
|
+
// //
|
|
251
|
+
// // for (i=0; i<RARRAY(ary1)->len; i++) {
|
|
252
|
+
// // v = vv = rb_ary_elt(ary1, i);
|
|
253
|
+
// // if (st_delete(RHASH(hash)->tbl, (st_data_t*)&vv, 0)) {
|
|
254
|
+
// // rb_ary_push(ary3, v);
|
|
255
|
+
// // }
|
|
256
|
+
// // }
|
|
257
|
+
// //
|
|
258
|
+
// // return ary3;
|
|
259
|
+
// VALUE length_sorted_array_of_arrays = (_length_sorted_array_of_arrays);
|
|
260
|
+
//
|
|
261
|
+
// // structs
|
|
262
|
+
// struct RArray *candidate_answer_set;
|
|
263
|
+
// struct RArray *current_set;
|
|
264
|
+
//
|
|
265
|
+
// // temps
|
|
266
|
+
// VALUE hash, v, vv;
|
|
267
|
+
// long i, j, k;
|
|
268
|
+
//
|
|
269
|
+
// // Get smallest array size
|
|
270
|
+
// candidate_answer_set = rb_ary_new2((RARRAY(rb_array_of_arrays->ptr[0])->len);
|
|
271
|
+
//
|
|
272
|
+
// hash = ary_make_hash(RARRAY(rb_array_of_arrays->ptr[0]), 0);
|
|
273
|
+
//
|
|
274
|
+
// // For each entry in candidate answer set
|
|
275
|
+
// // Get current value
|
|
276
|
+
// for(i = 0; i < candidate_answer_set->len; i++) {
|
|
277
|
+
// // e = candidate_answer_set->ptr[i];
|
|
278
|
+
// v = vv = rb_ary_elt(candidate_answer_set, i);
|
|
279
|
+
//
|
|
280
|
+
// // Find the current value in other arrays
|
|
281
|
+
// // if not found, break
|
|
282
|
+
// for(j = 1; j < rb_array_of_arrays->len; j++) {
|
|
283
|
+
// current_set = RARRAY(rb_array_of_arrays->ptr[j]);
|
|
284
|
+
// found = 0;
|
|
285
|
+
//
|
|
286
|
+
// // Find with a linear search
|
|
287
|
+
// for(k = 0; k < current_set->len; k++) {
|
|
288
|
+
// // if (e == current_set->ptr[k]) {
|
|
289
|
+
// if (st_delete(RHASH(hash)->tbl, (unsigned long*)&vv, 0))
|
|
290
|
+
// found = 1;
|
|
291
|
+
// break;
|
|
292
|
+
// }
|
|
293
|
+
// }
|
|
294
|
+
//
|
|
295
|
+
// // break if not found
|
|
296
|
+
// if (!found) {
|
|
297
|
+
// break;
|
|
298
|
+
// }
|
|
299
|
+
// }
|
|
300
|
+
//
|
|
301
|
+
// // remove from candidate answer set if not found
|
|
302
|
+
// if (!found) {
|
|
303
|
+
// rb_ary_push(result, v);
|
|
304
|
+
// // candidate_answer_set->ptr[i] = Qnil;
|
|
305
|
+
// }
|
|
306
|
+
// }
|
|
307
|
+
//
|
|
308
|
+
// // compact the candidate answer set
|
|
309
|
+
// // rb_ary_compact_bang(candidate_answer_set);
|
|
310
|
+
// rb_funcall(candidate_answer_set, rb_intern("compact!"), 0);
|
|
311
|
+
//
|
|
312
|
+
// return candidate_answer_set;
|
|
313
|
+
// }
|
|
314
|
+
|
|
315
|
+
// VALUE rb_ary_clear_bang(ary) VALUE ary; {
|
|
316
|
+
// rb_ary_modify(ary);
|
|
317
|
+
// ARY_SET_LEN(ary, 0);
|
|
318
|
+
// // capa stays the same
|
|
319
|
+
// // if (ARY_DEFAULT_SIZE * 2 < RARRAY(ary)->aux.capa) {
|
|
320
|
+
// // REALLOC_N(RARRAY(ary)->ptr, VALUE, ARY_DEFAULT_SIZE * 2);
|
|
321
|
+
// // RARRAY(ary)->aux.capa = ARY_DEFAULT_SIZE * 2;
|
|
322
|
+
// // }
|
|
323
|
+
// return ary;
|
|
324
|
+
// }
|
|
325
|
+
|
|
326
|
+
VALUE p_mPerformant, p_cArray;
|
|
327
|
+
|
|
328
|
+
void Init_performant() {
|
|
329
|
+
p_mPerformant = rb_define_module("Performant");
|
|
330
|
+
p_cArray = rb_define_class_under(p_mPerformant, "Array", rb_cObject);
|
|
331
|
+
// p_cArray = rb_define_module_under(p_mPerformant, "Array");
|
|
332
|
+
|
|
333
|
+
// rb_define_method(rb_cArray, "clear!", rb_ary_clear_bang, 0);
|
|
334
|
+
|
|
335
|
+
rb_define_singleton_method(p_cArray, "memory_efficient_intersect", memory_efficient_intersect, 1);
|
|
336
|
+
// rb_define_singleton_method(p_cArray, "brute_force_intersect", brute_force_intersect, 1);
|
|
337
|
+
// rb_define_singleton_method(p_cArray, "intersect_multiple_sorted", intersect_multiple_sorted, 1);
|
|
338
|
+
// rb_define_singleton_method(p_cArray, "intersect_multiple_with_hash", intersect_multiple_sorted_with_hash, 1);
|
|
339
|
+
}
|