gitlab-elasticsearch-git 0.0.6
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/CHANGELOG +9 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +97 -0
- data/LICENSE.txt +22 -0
- data/README.md +345 -0
- data/Rakefile +10 -0
- data/gitlab-elasticsearch-git.gemspec +28 -0
- data/lib/elasticsearch/git.rb +9 -0
- data/lib/elasticsearch/git/encoder_helper.rb +43 -0
- data/lib/elasticsearch/git/lite_blob.rb +25 -0
- data/lib/elasticsearch/git/model.rb +92 -0
- data/lib/elasticsearch/git/repository.rb +570 -0
- data/lib/elasticsearch/git/version.rb +5 -0
- data/test/lib/repository_test.rb +43 -0
- data/test/support/repo_info.rb +3 -0
- data/test/support/repository.rb +9 -0
- data/test/support/seed_helper.rb +6 -0
- data/test/test_helper.rb +29 -0
- metadata +169 -0
data/gitlab-elasticsearch-git.gemspec
ADDED
@@ -0,0 +1,28 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'elasticsearch/git/version'

Gem::Specification.new do |spec|
  spec.name          = "gitlab-elasticsearch-git"
  spec.version       = Elasticsearch::Git::VERSION
  spec.authors       = ["Andrey Kumanyaev", "Evgeniy Sokovikov", "GitLab B.V."]
  spec.email         = ["me@zzet.org", "skv-headless@yandex.ru"]
  spec.summary       = %q{Elasticsearch integrations for git repositories.}
  spec.description   = %q{Elasticsearch integrations for indexing git repositories.}
  spec.homepage      = "https://gitlab.com/gitlab-org/gitlab-elasticsearch-git"
  spec.license       = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency 'elasticsearch-model'
  spec.add_runtime_dependency 'elasticsearch-api', '~> 0.4.0'
  spec.add_runtime_dependency 'rugged', '~> 0.23.3'
  spec.add_runtime_dependency 'charlock_holmes', '~> 0.7.3'
  spec.add_runtime_dependency 'github-linguist', '~> 4.7.0'
  spec.add_runtime_dependency 'activemodel', '~> 4.2.0'
  spec.add_runtime_dependency 'activesupport', '~> 4.2.0'
end
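For consumers, the gem is typically pulled in through Bundler; a minimal Gemfile sketch (the version pin is illustrative):

# Gemfile
source 'https://rubygems.org'

gem 'gitlab-elasticsearch-git', '~> 0.0.6'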
data/lib/elasticsearch/git/encoder_helper.rb
ADDED
@@ -0,0 +1,43 @@
require 'active_support/concern'
require 'charlock_holmes'

module Elasticsearch
  module Git
    module EncoderHelper
      extend ActiveSupport::Concern

      included do
        def encode!(message)
          return nil unless message.respond_to? :force_encoding

          # If the message is already valid UTF-8, return it as-is
          message.force_encoding("UTF-8")
          return message if message.valid_encoding?

          # Return the message tagged as binary if it is binary data
          detect = CharlockHolmes::EncodingDetector.detect(message)
          return message.force_encoding("BINARY") if detect && detect[:type] == :binary

          # Re-tag the message with the detected encoding
          if detect && detect[:encoding]
            message.force_encoding(detect[:encoding])
          end

          # Transcode to UTF-8 and strip invalid characters
          message.replace clean(message)
        rescue
          encoding = detect ? detect[:encoding] : "unknown"
          "--broken encoding: #{encoding}"
        end

        private

        def clean(message)
          message.encode("UTF-16BE", undef: :replace, invalid: :replace, replace: "")
                 .encode("UTF-8")
                 .gsub("\0".encode("UTF-8"), "")
        end
      end
    end
  end
end
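A small sketch of the helper in use. The Sanitizer class is hypothetical (encode! becomes an instance method of whatever class includes the concern), and CharlockHolmes detection on very short strings can vary:

require 'elasticsearch/git/encoder_helper'

class Sanitizer
  include Elasticsearch::Git::EncoderHelper
end

# A Latin-1 byte string ("café") that is invalid as UTF-8:
latin1 = "caf\xE9".force_encoding("ISO-8859-1")

Sanitizer.new.encode!(latin1) # => "café" as valid UTF-8, if detection succeeds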
data/lib/elasticsearch/git/lite_blob.rb
ADDED
@@ -0,0 +1,25 @@
require 'linguist'
require 'elasticsearch/git/encoder_helper'

module Elasticsearch
  module Git
    class LiteBlob
      include Linguist::BlobHelper
      include Elasticsearch::Git::EncoderHelper

      attr_accessor :id, :name, :path, :data, :size, :mode, :commit_id

      def initialize(repo, raw_blob_hash)
        @id = raw_blob_hash[:oid]

        blob = repo.lookup(@id)

        @mode = raw_blob_hash[:mode].to_s(8)
        @size = blob.size
        @path = encode!(raw_blob_hash[:path])
        @name = @path.split('/').last
        @data = encode!(blob.content)
      end
    end
  end
end
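A sketch of constructing a LiteBlob by hand (the repository path and file name are hypothetical). The hash passed in mirrors what Rugged diff deltas expose via delta.new_file / delta.old_file:

require 'rugged'
require 'elasticsearch/git/lite_blob'

repo  = Rugged::Repository.new('/path/to/repo.git')
entry = repo.head.target.tree['README.md'] # => { name:, oid:, filemode:, type: }

blob = Elasticsearch::Git::LiteBlob.new(
  repo,
  { oid: entry[:oid], path: 'README.md', mode: entry[:filemode] }
)

blob.name     # => "README.md"
blob.text?    # Linguist::BlobHelper heuristic
blob.language # detected Linguist language, or nil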
data/lib/elasticsearch/git/model.rb
ADDED
@@ -0,0 +1,92 @@
require 'active_support/concern'
require 'active_model'
require 'elasticsearch/model'

module Elasticsearch
  module Git
    module Model
      extend ActiveSupport::Concern

      included do
        extend ActiveModel::Naming
        include ActiveModel::Model
        include Elasticsearch::Model

        env = if defined?(::Rails)
                ::Rails.env.to_s
              else
                "undefined"
              end

        index_name [self.name.downcase, 'index', env].join('-')

        settings \
          index: {
            analysis: {
              analyzer: {
                human_analyzer: {
                  type: 'custom',
                  tokenizer: 'human_tokenizer',
                  filter: %w(lowercase asciifolding human_ngrams)
                },
                path_analyzer: {
                  type: 'custom',
                  tokenizer: 'path_tokenizer',
                  filter: %w(lowercase asciifolding path_ngrams)
                },
                sha_analyzer: {
                  type: 'custom',
                  tokenizer: 'sha_tokenizer',
                  filter: %w(lowercase asciifolding sha_ngrams)
                },
                code_analyzer: {
                  type: 'custom',
                  tokenizer: 'standard',
                  filter: %w(lowercase asciifolding code_stemmer)
                }
              },
              tokenizer: {
                sha_tokenizer: {
                  type: "edgeNGram",
                  min_gram: 8,
                  max_gram: 40,
                  token_chars: %w(letter digit)
                },
                human_tokenizer: {
                  type: "nGram",
                  min_gram: 1,
                  max_gram: 20,
                  token_chars: %w(letter digit)
                },
                path_tokenizer: {
                  type: 'path_hierarchy',
                  reverse: true
                }
              },
              filter: {
                human_ngrams: {
                  type: "nGram",
                  min_gram: 1,
                  max_gram: 20
                },
                sha_ngrams: {
                  type: "edgeNGram",
                  min_gram: 8,
                  max_gram: 40
                },
                path_ngrams: {
                  type: "edgeNGram",
                  min_gram: 3,
                  max_gram: 15
                },
                code_stemmer: {
                  type: "stemmer",
                  name: "minimal_english"
                }
              }
            }
          }
      end
    end
  end
end
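A minimal sketch of what including the concern provides (the class name here is hypothetical): the including class gains ActiveModel behaviour plus an Elasticsearch::Model proxy, and its index name is derived from the class name and environment.

require 'elasticsearch/git/model'

class CodeIndex
  include Elasticsearch::Git::Model
end

CodeIndex.index_name
# => "codeindex-index-undefined" (outside Rails; under Rails the env name is used)

# Pushing the analyzers/tokenizers defined above to a cluster:
# CodeIndex.__elasticsearch__.create_index!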
data/lib/elasticsearch/git/repository.rb
ADDED
@@ -0,0 +1,570 @@
require 'active_support/concern'
require 'active_model'
require 'elasticsearch'
require 'elasticsearch/git/model'
require 'elasticsearch/git/encoder_helper'
require 'elasticsearch/git/lite_blob'
require 'rugged'
require 'open3'

module Elasticsearch
  module Git
    module Repository
      class CreateIndexException < StandardError; end

      extend ActiveSupport::Concern

      included do
        include Elasticsearch::Git::Model
        include Elasticsearch::Git::EncoderHelper

        mapping _timestamp: { enabled: true } do
          indexes :blob do
            indexes :id, type: :string, index_options: 'offsets', search_analyzer: :human_analyzer, analyzer: :human_analyzer
            indexes :rid, type: :string, index: :not_analyzed
            indexes :oid, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
            indexes :commit_sha, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
            indexes :path, type: :string, search_analyzer: :path_analyzer, analyzer: :path_analyzer
            indexes :content, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
            indexes :language, type: :string, index: :not_analyzed
          end

          indexes :commit do
            indexes :id, type: :string, index_options: 'offsets', search_analyzer: :human_analyzer, analyzer: :human_analyzer
            indexes :rid, type: :string, index: :not_analyzed
            indexes :sha, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer

            indexes :author do
              indexes :name, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
              indexes :email, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
              indexes :time, type: :date
            end

            indexes :committer do
              indexes :name, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
              indexes :email, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
              indexes :time, type: :date
            end

            indexes :message, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
          end
        end

        # Index all text-like blobs in the repository.
        #
        # All data is stored in one global index; documents from a given
        # repository can be selected via the 'rid' field, which can also be
        # used to store a 'project' id.
        #
        # blob {
        #   id - unique id of the blob across all repositories
        #   oid - blob id within the repository
        #   content - blob content
        #   commit_sha - sha of the last commit that touched the blob
        # }
        #
        # To search blobs, use type 'blob'.
        def index_blobs(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid)
          from, to = parse_revs(from_rev, to_rev)

          diff = repository_for_indexing.diff(from, to)

          diff.deltas.reverse.each_with_index do |delta, step|
            if delta.status == :deleted
              next if delta.old_file[:mode].to_s(8) == "160000" # skip submodules
              b = LiteBlob.new(repository_for_indexing, delta.old_file)
              delete_from_index_blob(b)
            else
              next if delta.new_file[:mode].to_s(8) == "160000" # skip submodules
              b = LiteBlob.new(repository_for_indexing, delta.new_file)
              index_blob(b, to)
            end

            # Run GC every 100 blobs
            ObjectSpace.garbage_collect if step % 100 == 0
          end
        end

        def index_blob(blob, target_sha)
          if can_index_blob?(blob)
            tries = 0

            begin
              client_for_indexing.index \
                index: "#{self.class.index_name}",
                type: self.class.name.underscore,
                id: "#{repository_id}_#{blob.path}",
                body: {
                  blob: {
                    type: "blob",
                    oid: blob.id,
                    rid: repository_id,
                    content: blob.data,
                    commit_sha: target_sha,
                    path: blob.path,
                    language: blob.language ? blob.language.name : "Text"
                  }
                }
            rescue Exception => ex
              if tries < 2
                tries += 1
                sleep 1
                retry
              else
                raise CreateIndexException, "Can't index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
              end
            end
          end
        end

        # Only index text-like files smaller than 1 MB
        def can_index_blob?(blob)
          blob.text? && (blob.size && blob.size.to_i < 1048576)
        end

        def delete_from_index_blob(blob)
          if blob.text?
            begin
              client_for_indexing.delete \
                index: "#{self.class.index_name}",
                type: "repository",
                id: "#{repository_id}_#{blob.path}"
            rescue Elasticsearch::Transport::Transport::Errors::NotFound
              return true
            rescue Exception => ex
              raise CreateIndexException, "Error with removing file from index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
            end
          end
        end

        # Index all commits in the repository.
        #
        # All data is stored in one global index; a repository can be
        # filtered via the 'rid' field, which can also be used to store a
        # 'project' id.
        #
        # commit {
        #   sha - commit sha
        #   author {
        #     name - commit author name
        #     email - commit author email
        #     time - commit time
        #   }
        #   committer {
        #     name - committer name
        #     email - committer email
        #     time - commit time
        #   }
        #   message - commit message
        # }
        #
        # To search commits, use type 'commit'.
        def index_commits(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid)
          from, to = parse_revs(from_rev, to_rev)
          range = [from, to].reject(&:nil?).join('..')
          out, err, status = Open3.capture3("git log #{range} --format=\"%H\"", chdir: repository_for_indexing.path)

          if status.success? && err.blank?
            # TODO: use a Rugged walker instead of shelling out
            commit_oids = out.split("\n")

            commit_oids.each_with_index do |commit, step|
              index_commit(repository_for_indexing.lookup(commit))
              ObjectSpace.garbage_collect if step % 100 == 0
            end
            return commit_oids.count
          end

          0
        end

        def index_commit(commit)
          tries = 0

          begin
            client_for_indexing.index \
              index: "#{self.class.index_name}",
              type: self.class.name.underscore,
              id: "#{repository_id}_#{commit.oid}",
              body: {
                commit: {
                  type: "commit",
                  rid: repository_id,
                  sha: commit.oid,
                  author: commit.author,
                  committer: commit.committer,
                  message: encode!(commit.message)
                }
              }
          rescue Exception => ex
            if tries < 2
              tries += 1
              sleep 1
              retry
            else
              raise CreateIndexException, "Can't index #{repository_id}_#{commit.oid}. Reason: #{ex.message}"
            end
          end
        end

        def parse_revs(from_rev, to_rev)
          from = if index_new_branch?(from_rev)
                   if to_rev == repository_for_indexing.last_commit.oid
                     nil
                   else
                     merge_base(to_rev)
                   end
                 else
                   from_rev
                 end

          return from, to_rev
        end

        def index_new_branch?(from)
          from == '0000000000000000000000000000000000000000'
        end

        # Representation of the repository as indexable JSON.
        # Attention: the resulting hash can be extremely large.
        def as_indexed_json(options = {})
          data = {}
          data[:blobs] = index_blobs_array
          data[:commits] = index_commits_array
          data
        end

        # Build the array of blob documents from the current tree or index
        def index_blobs_array
          result = []

          target_sha = repository_for_indexing.head.target.oid

          if repository_for_indexing.bare?
            tree = repository_for_indexing.lookup(target_sha).tree
            result.push(recurse_blobs_index_hash(tree))
          else
            repository_for_indexing.index.each do |blob|
              b = LiteBlob.new(repository_for_indexing, blob)
              result.push(
                {
                  type: 'blob',
                  id: "#{target_sha}_#{b.path}",
                  rid: repository_id,
                  oid: b.id,
                  content: b.data,
                  commit_sha: target_sha
                }
              ) if b.text?
            end
          end

          result
        end

        def recurse_blobs_index_hash(tree, path = "")
          result = []

          tree.each_blob do |blob|
            blob[:path] = path + blob[:name]
            b = LiteBlob.new(repository_for_indexing, blob)
            result.push(
              {
                type: 'blob',
                id: "#{repository_for_indexing.head.target.oid}_#{path}#{blob[:name]}",
                rid: repository_id,
                oid: b.id,
                content: b.data,
                commit_sha: repository_for_indexing.head.target.oid
              }
            ) if b.text?
          end

          tree.each_tree do |nested_tree|
            result.push(recurse_blobs_index_hash(repository_for_indexing.lookup(nested_tree[:oid]), "#{nested_tree[:name]}/"))
          end

          result.flatten
        end

        # Look up all object ids for commit objects
        def index_commits_array
          res = []

          repository_for_indexing.each_id do |oid|
            obj = repository_for_indexing.lookup(oid)
            if obj.type == :commit
              res.push(
                {
                  type: 'commit',
                  sha: obj.oid,
                  author: obj.author,
                  committer: obj.committer,
                  message: encode!(obj.message)
                }
              )
            end
          end

          res
        end

        def search(query, type: :all, page: 1, per: 20, options: {})
          options[:repository_id] = repository_id if options[:repository_id].nil?
          self.class.search(query, type: type, page: page, per: per, options: options)
        end

        # The repository id is used to tell data from different repositories
        # apart. Update this value if needed.
        def set_repository_id id = nil
          @repository_id = id || path_to_repo
        end

        # Meant to be overridden
        def repository_id
          @repository_id
        end

        # Meant to be overridden
        def self.repositories_count
          10
        end

        unless defined?(path_to_repo)
          def path_to_repo
            if @path_to_repo.blank?
              raise NotImplementedError, 'Please, define "path_to_repo" method, or set "path_to_repo" via "repository_for_indexing" method'
            else
              @path_to_repo
            end
          end
        end

        def repository_for_indexing(repo_path = nil)
          return @rugged_repo_indexer if defined? @rugged_repo_indexer

          @path_to_repo ||= repo_path || path_to_repo

          set_repository_id

          @rugged_repo_indexer = Rugged::Repository.new(@path_to_repo)
        end

        def client_for_indexing
          @client_for_indexing ||= Elasticsearch::Client.new log: true
        end

        def self.search(query, type: :all, page: 1, per: 20, options: {})
          results = { blobs: [], commits: [] }

          case type.to_sym
          when :all
            results[:blobs] = search_blob(query, page: page, per: per, options: options)
            results[:commits] = search_commit(query, page: page, per: per, options: options)
          when :blob
            results[:blobs] = search_blob(query, page: page, per: per, options: options)
          when :commit
            results[:commits] = search_commit(query, page: page, per: per, options: options)
          end

          results
        end

        private

        def merge_base(to_rev)
          head_sha = repository_for_indexing.last_commit.oid
          repository_for_indexing.merge_base(to_rev, head_sha)
        end
      end

      module ClassMethods
        def search_commit(query, page: 1, per: 20, options: {})
          page ||= 1

          fields = %w(message^10 sha^5 author.name^2 author.email^2 committer.name committer.email).map { |i| "commit.#{i}" }

          query_hash = {
            query: {
              filtered: {
                query: {
                  multi_match: {
                    fields: fields,
                    query: "#{query}",
                    operator: :or
                  }
                }
              }
            },
            aggs: {
              commitRepositoryFaset: {
                terms: {
                  field: "commit.rid",
                  all_terms: true,
                  size: repositories_count
                }
              }
            },
            size: per,
            from: per * (page - 1)
          }

          if query.blank?
            query_hash[:query][:filtered][:query] = { match_all: {} }
            query_hash[:track_scores] = true
          end

          if options[:repository_id]
            query_hash[:query][:filtered][:filter] ||= { and: [] }
            query_hash[:query][:filtered][:filter][:and] << {
              terms: {
                "commit.rid" => [options[:repository_id]].flatten
              }
            }
          end

          if options[:highlight]
            es_fields = fields.map { |field| field.split('^').first }.inject({}) do |memo, field|
              memo[field.to_sym] = {}
              memo
            end

            query_hash[:highlight] = {
              pre_tags: ["gitlabelasticsearch→"],
              post_tags: ["←gitlabelasticsearch"],
              fields: es_fields
            }
          end

          options[:order] = :default if options[:order].blank?

          order = case options[:order].to_sym
                  when :recently_indexed
                    { _timestamp: { order: :desc, mode: :min } }
                  when :last_indexed
                    { _timestamp: { order: :asc, mode: :min } }
                  else
                    {}
                  end

          query_hash[:sort] = order.blank? ? [:_score] : [order, :_score]

          res = self.__elasticsearch__.search(query_hash)
          {
            results: res.results,
            total_count: res.size,
            repositories: res.response["aggregations"]["commitRepositoryFaset"]["buckets"]
          }
        end

        def search_blob(query, type: :all, page: 1, per: 20, options: {})
          page ||= 1

          query_hash = {
            query: {
              filtered: {
                query: {
                  match: {
                    'blob.content' => {
                      query: "#{query}",
                      operator: :and
                    }
                  }
                }
              }
            },
            aggs: {
              languageFacet: {
                terms: {
                  field: :language,
                  all_terms: true,
                  size: 20
                }
              },
              blobRepositoryFaset: {
                terms: {
                  field: :rid,
                  all_terms: true,
                  size: repositories_count
                }
              }
            },
            size: per,
            from: per * (page - 1)
          }

          if options[:repository_id]
            query_hash[:query][:filtered][:filter] ||= { and: [] }
            query_hash[:query][:filtered][:filter][:and] << {
              terms: {
                "blob.rid" => [options[:repository_id]].flatten
              }
            }
          end

          if options[:language]
            query_hash[:query][:filtered][:filter] ||= { and: [] }
            query_hash[:query][:filtered][:filter][:and] << {
              terms: {
                "blob.language" => [options[:language]].flatten
              }
            }
          end

          options[:order] = :default if options[:order].blank?

          order = case options[:order].to_sym
                  when :recently_indexed
                    { _timestamp: { order: :desc, mode: :min } }
                  when :last_indexed
                    { _timestamp: { order: :asc, mode: :min } }
                  else
                    {}
                  end

          query_hash[:sort] = order.blank? ? [:_score] : [order, :_score]

          if options[:highlight]
            query_hash[:highlight] = {
              pre_tags: ["gitlabelasticsearch→"],
              post_tags: ["←gitlabelasticsearch"],
              fields: {
                "blob.content" => {
                  "type" => "fvh",
                  "boundary_chars" => "\n"
                }
              }
            }
          end

          res = self.__elasticsearch__.search(query_hash)

          {
            results: res.results,
            total_count: res.size,
            languages: res.response["aggregations"]["languageFacet"]["buckets"],
            repositories: res.response["aggregations"]["blobRepositoryFaset"]["buckets"]
          }
        end

        def search_file_names(query, page: 1, per: 20, options: {})
          query_hash = {
            fields: ['blob.path'],
            query: {
              fuzzy: {
                "repository.blob.path" => { value: query }
              }
            },
            filter: {
              term: {
                "repository.blob.rid" => [options[:repository_id]].flatten
              }
            },
            size: per,
            from: per * (page - 1)
          }

          self.__elasticsearch__.search(query_hash)
        end
      end
    end
  end
end
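Tying the pieces together, a hedged end-to-end sketch. The class wiring and repository path are hypothetical, an Elasticsearch server reachable with the default client settings is assumed, lib/elasticsearch/git.rb is assumed to require the repository module, and the exact from_rev/to_rev arguments you need depend on the repository's state:

require 'elasticsearch/git'

class IndexedRepo
  include Elasticsearch::Git::Repository
end

repo = IndexedRepo.new
repo.repository_for_indexing('/var/repos/project.git') # also sets the repository id

repo.index_commits # index commits reachable from HEAD; returns the number indexed
repo.index_blobs   # index text-like blobs changed up to HEAD

# Search blobs in this repository, with highlighting:
hits = repo.search('def initialize', type: :blob, options: { highlight: true })
hits[:blobs][:results].each { |r| puts r.blob.path }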