tantiny-in-memory 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,165 @@
# frozen_string_literal: true

module Tantiny
  class Index
    LOCKFILE = ".tantiny.lock"
    DEFAULT_WRITER_MEMORY = 5_000_000 # 5MB
    DEFAULT_LIMIT = 10

    # Builds the native index object via the Rust-backed `__new`, then runs
    # the Ruby-side initializer on it. The schema is declared by the block.
    #
    # @param path [String, #to_s] directory that stores the index
    # @param options [Hash] :tokenizer, :writer_memory, :exclusive_writer
    def self.new(path, **options, &block)
      default_tokenizer = options[:tokenizer] || Tokenizer.default
      schema = Schema.new(default_tokenizer, &block)

      object = __new(
        path.to_s,
        schema.default_tokenizer,
        schema.field_tokenizers.transform_keys(&:to_s),
        schema.text_fields.map(&:to_s),
        schema.string_fields.map(&:to_s),
        schema.integer_fields.map(&:to_s),
        schema.double_fields.map(&:to_s),
        schema.date_fields.map(&:to_s),
        schema.facet_fields.map(&:to_s)
      )

      # `__new` allocates the object natively, so `initialize` has to be
      # invoked explicitly on the returned instance.
      object.send(:initialize, path, schema, **options)

      object
    end

    def initialize(path, schema, **options)
      @path = path
      @schema = schema

      @indexer_memory = options[:writer_memory] || DEFAULT_WRITER_MEMORY
      @exclusive_writer = options[:exclusive_writer] || false

      # Transaction state is tracked per-thread; the mutex serializes
      # writers inside this process, the lockfile across processes.
      @active_transaction = Concurrent::ThreadLocalVar.new(false)
      @transaction_semaphore = Mutex.new

      acquire_index_writer if exclusive_writer?
    end

    attr_reader :schema

    # Runs the block inside a single transaction; nested calls are folded
    # into the outermost one. FIX: on an exceptional exit the index writer
    # is now released and the per-thread flag cleared — previously a raise
    # inside the block left the transaction open and the writer lock held.
    def transaction
      if inside_transaction?
        yield
      else
        synchronize do
          open_transaction!

          begin
            yield

            close_transaction!
          ensure
            # Only reached with the flag still set when the block (or the
            # commit) raised; no commit is issued on this path.
            if inside_transaction?
              release_index_writer unless exclusive_writer?
              @active_transaction.value = false
            end
          end
        end
      end

      nil
    end

    # Reopens the native reader so recent commits become visible.
    def reload
      __reload
    end

    # Adds (or replaces) a document. Field values are looked up on the
    # document (hash key or method call) and coerced per field type.
    def <<(document)
      transaction do
        __add_document(
          resolve(document, schema.id_field).to_s,
          slice_document(document, schema.text_fields) { |v| v.to_s },
          slice_document(document, schema.string_fields) { |v| v.to_s },
          slice_document(document, schema.integer_fields) { |v| v.to_i },
          slice_document(document, schema.double_fields) { |v| v.to_f },
          slice_document(document, schema.date_fields) { |v| Helpers.timestamp(v) },
          slice_document(document, schema.facet_fields) { |v| v.to_s }
        )
      end
    end

    # Deletes the document with the given id.
    def delete(id)
      transaction do
        __delete_document(id.to_s)
      end
    end

    # Runs a query and returns matching document ids. A plain string is
    # wrapped in a smart query over all text fields.
    def search(query, limit: DEFAULT_LIMIT, **smart_query_options)
      unless query.is_a?(Query)
        fields = schema.text_fields
        query = Query.smart_query(self, fields, query.to_s, **smart_query_options)
      end

      __search(query, limit)
    end

    # Shortcuts for creating queries:
    Query::TYPES.each do |query_type|
      method_name = "#{query_type}_query"
      define_method(method_name) do |*args, **kwargs|
        Query.send(method_name, self, *args, **kwargs)
      end
    end

    private

    # Extracts the given fields from the document into a string-keyed hash,
    # dropping nil values and coercing the rest with the block.
    def slice_document(document, fields, &block)
      fields.inject({}) do |hash, field|
        hash.tap { |h| h[field.to_s] = resolve(document, field) }
      end.compact.transform_values(&block)
    end

    # Hash documents are read by key, anything else by method call.
    def resolve(document, field)
      document.is_a?(Hash) ? document[field] : document.send(field)
    end

    # Acquires the single Tantivy index writer, translating the native
    # lockfile error into IndexWriterBusyError.
    def acquire_index_writer
      __acquire_index_writer(@indexer_memory)
    rescue TantivyError => e
      case e.message
      when /Failed to acquire Lockfile/
        raise IndexWriterBusyError
      else
        raise
      end
    end

    def release_index_writer
      __release_index_writer
    end

    def commit
      __commit
    end

    def open_transaction!
      acquire_index_writer unless exclusive_writer?

      @active_transaction.value = true
    end

    def close_transaction!
      commit

      release_index_writer unless exclusive_writer?

      @active_transaction.value = false
    end

    def inside_transaction?
      @active_transaction.value
    end

    def exclusive_writer?
      @exclusive_writer
    end

    # Process-level mutex plus a cross-process file lock around the block.
    def synchronize(&block)
      @transaction_semaphore.synchronize do
        Helpers.with_lock(lockfile_path, &block)
      end
    end

    def lockfile_path
      @lockfile_path ||= File.join(@path, LOCKFILE)
    end
  end
end
@@ -0,0 +1,165 @@
# frozen_string_literal: true

require "date"

module Tantiny
  class Query
    TYPES = %i[
      all empty term fuzzy_term
      phrase regex range facet
      smart prefix
    ].freeze

    DEFAULT_BOOST = 1.0
    DEFAULT_FUZZY_DISTANCE = 1

    class << self
      # Logical AND over queries; a lone query is returned unchanged.
      def conjunction(*queries)
        # @type var queries: Array[untyped]
        queries.one? ? queries.first : __conjunction(queries)
      end

      # Logical OR over queries; a lone query is returned unchanged.
      def disjunction(*queries)
        # @type var queries: Array[untyped]
        queries.one? ? queries.first : __disjunction(queries)
      end

      # Matches every document. The index argument is accepted only for
      # signature parity with the other query constructors.
      def all_query(_index = nil)
        __new_all_query
      end

      # Matches no documents.
      def empty_query(_index = nil)
        __new_empty_query
      end

      def term_query(index, fields, term, **options)
        allowed_fields = text_and_strings(index)
        construct_query(index, :term, allowed_fields, fields, [term.to_s], **options)
      end

      def fuzzy_term_query(index, fields, term, distance = DEFAULT_FUZZY_DISTANCE, **options)
        params = [term.to_s, distance.to_i]
        allowed_fields = text_and_strings(index)
        construct_query(index, :fuzzy_term, allowed_fields, fields, params, **options)
      end

      # Tokenizes the phrase per-field with that field's own tokenizer.
      # FIX: the allowed-fields lookup is hoisted out of the loop instead of
      # being recomputed for every field.
      def phrase_query(index, fields, phrase, **options)
        allowed_fields = index.schema.text_fields

        queries = [*fields].map do |f|
          terms = index.schema.tokenizer_for(f).terms(phrase)
          construct_query(index, :phrase, allowed_fields, f, [terms], **options)
        end

        queries.empty? ? empty_query : disjunction(*queries)
      end

      def regex_query(index, fields, regex, **options)
        allowed_fields = text_and_strings(index)
        construct_query(index, :regex, allowed_fields, fields, [regex.to_s], **options)
      end

      # Prefix match implemented as an escaped regex with a trailing ".*".
      def prefix_query(index, fields, prefix, **options)
        regex_query(index, fields, Regexp.escape(prefix) + ".*", **options)
      end

      # Range query over integer, double or date fields; the field type is
      # inferred from the class of the range's first element.
      def range_query(index, fields, range, **options)
        schema = index.schema

        case range.first
        when Integer
          allowed_fields = schema.integer_fields
          from, to = [range.min, range.max]
        when Float
          allowed_fields = schema.double_fields
          from, to = [range.first, range.last]
        when Date, DateTime
          # @type var range: Range[Date | DateTime]
          allowed_fields = schema.date_fields
          from, to = [Helpers.timestamp(range.first), Helpers.timestamp(range.last)]
        else
          raise UnsupportedRange.new(range.first.class)
        end

        # @type var allowed_fields: Array[Symbol]
        construct_query(index, :range, allowed_fields, fields, [from, to], **options)
      end

      def facet_query(index, field, path, **options)
        allowed_fields = index.schema.facet_fields
        construct_query(index, :facet, allowed_fields, field, [path], **options)
      end

      # Per-field conjunction of term queries (the last term may match as a
      # prefix), disjoined across fields and boosted as a whole.
      def smart_query(index, fields, query_string, **options)
        fuzzy_distance = options[:fuzzy_distance]
        boost_factor = options.fetch(:boost, DEFAULT_BOOST)

        field_queries = [*fields].map do |field|
          terms = index.schema.tokenizer_for(field).terms(query_string)

          # See: https://github.com/soutaro/steep/issues/272
          # @type block: nil | Query
          next if terms.empty?

          term_queries = terms.map do |term|
            if fuzzy_distance.nil?
              term_query(index, field, term)
            else
              fuzzy_term_query(index, field, term, fuzzy_distance)
            end
          end

          # @type var terms: untyped
          # @type var term_queries: untyped
          last_term_query = prefix_query(index, field, terms.last) | term_queries.last

          conjunction(last_term_query, *term_queries[0...-1])
        end.compact

        disjunction(*field_queries).boost(boost_factor)
      end

      private

      # Can't use variadic argument `params` here due to:
      # https://github.com/soutaro/steep/issues/480
      # FIX: the empty-fields guard now runs before the map, so no native
      # sub-queries are built just to be discarded.
      def construct_query(index, query_type, allowed_fields, fields, params, **options)
        field_list = [*fields]

        return empty_query if field_list.empty?

        queries = field_list.map do |field|
          raise UnsupportedField.new(field) unless allowed_fields.include?(field)

          send("__new_#{query_type}_query", index, field.to_s, *params)
        end

        disjunction(*queries).boost(options.fetch(:boost, DEFAULT_BOOST))
      end

      def text_and_strings(index)
        index.schema.text_fields | index.schema.string_fields
      end
    end

    def |(other)
      raise ArgumentError.new("Not a #{self.class}.") unless other.is_a?(self.class)

      self.class.disjunction(self, other)
    end

    def &(other)
      raise ArgumentError.new("Not a #{self.class}.") unless other.is_a?(self.class)

      self.class.conjunction(self, other)
    end

    def !
      __negation
    end

    # Wraps the query with a boost factor; DEFAULT_BOOST is a no-op and
    # returns self unchanged.
    def boost(boost_factor)
      return self if boost_factor == DEFAULT_BOOST

      __boost(boost_factor.to_f)
    end
  end
end
@@ -0,0 +1,53 @@
# frozen_string_literal: true

module Tantiny
  # Declarative schema for an index. The block given to the constructor is
  # evaluated in the schema's own context, so the private DSL methods below
  # (id, text, string, integer, double, date, facet) register the fields.
  class Schema
    attr_reader :default_tokenizer,
      :id_field,
      :text_fields,
      :string_fields,
      :integer_fields,
      :double_fields,
      :date_fields,
      :facet_fields,
      :field_tokenizers

    def initialize(tokenizer, &definition)
      @default_tokenizer = tokenizer
      @id_field = :id
      @field_tokenizers = {}

      @text_fields = []
      @string_fields = []
      @integer_fields = []
      @double_fields = []
      @date_fields = []
      @facet_fields = []

      instance_exec(&definition)
    end

    # Tokenizer for the given field, falling back to the schema-wide default.
    def tokenizer_for(field)
      field_tokenizers[field] || default_tokenizer
    end

    private

    # Overrides the id field (defaults to :id).
    def id(key)
      @id_field = key
    end

    # Declares a full-text field, optionally with its own tokenizer.
    def text(key, tokenizer: nil)
      @field_tokenizers[key] = tokenizer if tokenizer

      @text_fields << key
    end

    def string(key)
      @string_fields << key
    end

    def integer(key)
      @integer_fields << key
    end

    def double(key)
      @double_fields << key
    end

    def date(key)
      @date_fields << key
    end

    def facet(key)
      @facet_fields << key
    end
  end
end
@@ -0,0 +1,28 @@
# frozen_string_literal: true

module Tantiny
  class Tokenizer
    # The tokenizer used when a schema does not specify one.
    def self.default
      new(:simple)
    end

    # Factory dispatching to the native tokenizer constructors.
    # Supported kinds: :simple, :stemmer (language:), :ngram (min:, max:,
    # prefix_only:). Raises UnknownTokenizer for anything else.
    def self.new(kind, **options)
      case kind
      when :simple
        __new_simple_tokenizer
      when :stemmer
        __new_stemmer_tokenizer((options[:language] || :en).to_s)
      when :ngram
        __new_ngram_tokenizer(
          options[:min],
          options[:max],
          options.fetch(:prefix_only, false)
        )
      else
        raise UnknownTokenizer.new(kind)
      end
    end

    # Splits the string into terms using this tokenizer.
    def terms(string)
      __extract_terms(string)
    end
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module Tantiny
  # Gem version; the trailing marker lets release-please bump it automatically.
  VERSION = "1.0.0" # {x-release-please-version}
end
data/lib/tantiny.rb ADDED
@@ -0,0 +1,27 @@
# frozen_string_literal: true

# Transpiled-syntax support must be set up before the gem's files load.
require "ruby-next/language/setup"
RubyNext::Language.setup_gem_load_path

require "rutie"
require "thermite/fiddle"
require "concurrent"
require "fileutils"

require "tantiny/version"
require "tantiny/errors"
require "tantiny/helpers"
require "tantiny/schema"
require "tantiny/tokenizer"
require "tantiny/query"
require "tantiny/index"

module Tantiny
  project_dir = File.expand_path("../..", __FILE__)

  # Loads the compiled Rust extension, which defines the native `__*`
  # methods used by Index, Query and Tokenizer.
  Thermite::Fiddle.load_module(
    "Init_tantiny",
    cargo_project_path: project_dir,
    ruby_project_path: project_dir
  )
end
data/lib/tantiny.so ADDED
Binary file
@@ -0,0 +1,20 @@
# Type signatures for Tantiny's error classes.
module Tantiny
  # Raised when the native Tantivy layer reports an error.
  class TantivyError < StandardError
  end

  class UnknownField < StandardError
    def initialize: () -> void
  end

  class UnknownTokenizer < StandardError
    def initialize: (Symbol tokenizer_type) -> void
  end

  class UnsupportedRange < StandardError
    def initialize: (Class range_type) -> void
  end

  class UnsupportedField < StandardError
    def initialize: (Symbol field) -> void
  end
end
@@ -0,0 +1,8 @@
# Type signatures for Tantiny::Helpers.
module Tantiny
  module Helpers
    # Formats a date as the string representation stored in the index.
    def self.timestamp: ((Date | DateTime) date) -> String

    # Runs the block while holding a file lock at the given path.
    def self.with_lock: (String lockfile) { (*untyped) -> void } -> void
  end
end
@@ -0,0 +1,103 @@
# Type signatures for Tantiny::Index. Methods prefixed with `__` are
# implemented natively by the Rust extension.
module Tantiny
  class Index
    LOCKFILE: String
    DEFAULT_WRITER_MEMORY: Integer
    DEFAULT_LIMIT: Integer

    def self.new: (
      String path,
      **untyped options
    ) { (*untyped) -> void } -> Index

    # Native allocator/constructor.
    def self.__new: (
      String path,
      Tokenizer default_tokenizer,
      Hash[String, Tokenizer] field_tokenizers,
      Array[String] text_fields,
      Array[String] string_fields,
      Array[String] integer_fields,
      Array[String] double_fields,
      Array[String] date_fields,
      Array[String] facet_fields
    ) -> Index

    def initialize: (
      String path,
      Schema schema,
      **untyped options
    ) -> void

    attr_reader schema: Schema

    def transaction: () { (*untyped) -> void } -> void

    def reload: () -> void
    def <<: (untyped document) -> void
    def delete: (String id) -> void

    def search: (
      (Query | String) query,
      ?limit: Integer,
      **untyped smart_query_options
    ) -> Array[String]

    # Query shortcuts defined dynamically from Query::TYPES.
    def all_query: () -> Query
    def empty_query: () -> Query
    def term_query: (fields fields, String term, **untyped options) -> Query
    def fuzzy_term_query: (fields fields, String term, ?Integer distance, **untyped options) -> Query
    def phrase_query: (fields fields, String phrase, **untyped options) -> Query
    def regex_query: (fields fields, String regex, **untyped options) -> Query
    def prefix_query: (fields fields, String prefix, **untyped options) -> Query
    def facet_query: (Symbol field, String path, **untyped options) -> Query
    def range_query: (fields fields, Range[numeric | date] range, **untyped options) -> Query
    def smart_query: (fields fields, String query_string, **untyped options) -> Query

    # Native index operations.
    def __commit: () -> void
    def __reload: () -> void

    def __add_document: (
      String id,
      Hash[String, String] text_fields,
      Hash[String, String] string_fields,
      Hash[String, Integer] integer_fields,
      Hash[String, Float] double_fields,
      Hash[String, String] date_fields,
      Hash[String, String] facet_fields
    ) -> void

    def __delete_document: (String id) -> void

    def __search: (Query query, Integer limit) -> Array[String]

    def __acquire_index_writer: (Integer overall_memory) -> void
    def __release_index_writer: () -> void

    private

    def commit: () -> void

    def slice_document: (
      untyped document,
      Array[Symbol] fields
    ) { (untyped v) -> untyped } -> Hash[String, untyped]

    def default_search: (
      String query_string,
      Integer limit,
      ?fuzzy_distance: Integer
    ) -> Array[String]

    def resolve: (untyped document, Symbol field) -> untyped

    def synchronize: () { (*untyped) -> void } -> void
    def lockfile_path: () -> String

    def exclusive_writer?: () -> bool
    def acquire_index_writer: () -> void
    def release_index_writer: () -> void

    def open_transaction!: () -> void
    def close_transaction!: () -> void
    def inside_transaction?: () -> bool
  end
end