gitlab-elasticsearch-git 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +12 -4
- data/gitlab-elasticsearch-git.gemspec +6 -6
- data/lib/elasticsearch/git/model.rb +3 -18
- data/lib/elasticsearch/git/repository.rb +98 -95
- data/lib/elasticsearch/git/version.rb +1 -1
- data/test/test_helper.rb +1 -1
- metadata +17 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 407b4f13d2f80ccce7365e649318b811fcf3d164
|
4
|
+
data.tar.gz: 80651810beb864805f008c8ecc7d51201eb6b4fc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 276867d9624cccaba15a97c0847222fe57958394023d5d7afdb52f7c3e9f315770d5f8dac4393f899c8130d8e545ebdaf128f6064503a981722c0951a00a0c8e
|
7
|
+
data.tar.gz: 6ad0348184aa893e97b379e31e6b2b9dcfea7002ddd8bd80e6b2ec211b734a369ecee4149e73c27e536f3666aa6dfe54d3dd993c6f4d9b1ccdbe78a94c4d2d70
|
data/CHANGELOG
CHANGED
@@ -1,9 +1,17 @@
|
|
1
|
+
0.0.8
|
2
|
+
- Using Elastic bulk API
|
3
|
+
- Optimisations of index mappings
|
4
|
+
- Performance optimization
|
5
|
+
|
6
|
+
0.0.7
|
7
|
+
- Refactoring
|
8
|
+
|
1
9
|
0.0.6
|
2
|
-
|
10
|
+
- Support elasticsearch-model gem version 0.1.8
|
3
11
|
|
4
12
|
0.0.5
|
5
|
-
|
6
|
-
|
13
|
+
- Search by file name on master branch
|
14
|
+
- Migrate from elasticsearch 0.9.x -> 1.x
|
7
15
|
|
8
16
|
0.0.4
|
9
|
-
- Stable version
|
17
|
+
- Stable version
|
@@ -18,11 +18,11 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_runtime_dependency 'elasticsearch-model'
|
22
|
-
spec.add_runtime_dependency 'elasticsearch-api', '~> 1.0
|
21
|
+
spec.add_runtime_dependency 'elasticsearch-model', '~> 1.0'
|
22
|
+
spec.add_runtime_dependency 'elasticsearch-api', '~> 1.0'
|
23
23
|
spec.add_runtime_dependency 'rugged', '~> 0.23.3'
|
24
|
-
spec.add_runtime_dependency 'charlock_holmes', '~> 0.7
|
25
|
-
spec.add_runtime_dependency 'github-linguist', '~> 4.7
|
26
|
-
spec.add_runtime_dependency 'activemodel', '~> 4.2
|
27
|
-
spec.add_runtime_dependency 'activesupport', '~> 4.2
|
24
|
+
spec.add_runtime_dependency 'charlock_holmes', '~> 0.7'
|
25
|
+
spec.add_runtime_dependency 'github-linguist', '~> 4.7'
|
26
|
+
spec.add_runtime_dependency 'activemodel', '~> 4.2'
|
27
|
+
spec.add_runtime_dependency 'activesupport', '~> 4.2'
|
28
28
|
end
|
@@ -27,17 +27,17 @@ module Elasticsearch
|
|
27
27
|
human_analyzer: {
|
28
28
|
type: 'custom',
|
29
29
|
tokenizer: 'human_tokenizer',
|
30
|
-
filter: %w(lowercase asciifolding
|
30
|
+
filter: %w(lowercase asciifolding)
|
31
31
|
},
|
32
32
|
path_analyzer: {
|
33
33
|
type: 'custom',
|
34
34
|
tokenizer: 'path_tokenizer',
|
35
|
-
filter: %w(lowercase asciifolding
|
35
|
+
filter: %w(lowercase asciifolding)
|
36
36
|
},
|
37
37
|
sha_analyzer: {
|
38
38
|
type: 'custom',
|
39
39
|
tokenizer: 'sha_tokenizer',
|
40
|
-
filter: %w(lowercase asciifolding
|
40
|
+
filter: %w(lowercase asciifolding)
|
41
41
|
},
|
42
42
|
code_analyzer: {
|
43
43
|
type: 'custom',
|
@@ -64,21 +64,6 @@ module Elasticsearch
|
|
64
64
|
},
|
65
65
|
},
|
66
66
|
filter: {
|
67
|
-
human_ngrams: {
|
68
|
-
type: "nGram",
|
69
|
-
min_gram: 1,
|
70
|
-
max_gram: 20
|
71
|
-
},
|
72
|
-
sha_ngrams: {
|
73
|
-
type: "edgeNGram",
|
74
|
-
min_gram: 8,
|
75
|
-
max_gram: 40
|
76
|
-
},
|
77
|
-
path_ngrams: {
|
78
|
-
type: "edgeNGram",
|
79
|
-
min_gram: 3,
|
80
|
-
max_gram: 15
|
81
|
-
},
|
82
67
|
code_stemmer: {
|
83
68
|
type: "stemmer",
|
84
69
|
name: "minimal_english"
|
@@ -11,6 +11,9 @@ module Elasticsearch
|
|
11
11
|
module Repository
|
12
12
|
class CreateIndexException < StandardError; end
|
13
13
|
|
14
|
+
BLOBS_BATCH = 100
|
15
|
+
COMMMITS_BATCH = 500
|
16
|
+
|
14
17
|
extend ActiveSupport::Concern
|
15
18
|
|
16
19
|
included do
|
@@ -19,33 +22,33 @@ module Elasticsearch
|
|
19
22
|
|
20
23
|
mapping _timestamp: { enabled: true } do
|
21
24
|
indexes :blob do
|
22
|
-
indexes :id, type: :string, index_options: 'offsets',
|
25
|
+
indexes :id, type: :string, index_options: 'offsets', analyzer: :human_analyzer
|
23
26
|
indexes :rid, type: :string, index: :not_analyzed
|
24
|
-
indexes :oid, type: :string, index_options: 'offsets',
|
25
|
-
indexes :commit_sha, type: :string, index_options: 'offsets',
|
26
|
-
indexes :path, type: :string,
|
27
|
-
indexes :content, type: :string, index_options: 'offsets',
|
27
|
+
indexes :oid, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
28
|
+
indexes :commit_sha, type: :string, index_options: 'offsets', analyzer: :sha_analyzer
|
29
|
+
indexes :path, type: :string, analyzer: :path_analyzer
|
30
|
+
indexes :content, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
28
31
|
indexes :language, type: :string, index: :not_analyzed
|
29
32
|
end
|
30
33
|
|
31
34
|
indexes :commit do
|
32
|
-
indexes :id, type: :string, index_options: 'offsets',
|
35
|
+
indexes :id, type: :string, index_options: 'offsets', analyzer: :human_analyzer
|
33
36
|
indexes :rid, type: :string, index: :not_analyzed
|
34
|
-
indexes :sha, type: :string, index_options: 'offsets',
|
37
|
+
indexes :sha, type: :string, index_options: 'offsets', analyzer: :sha_analyzer
|
35
38
|
|
36
39
|
indexes :author do
|
37
|
-
indexes :name, type: :string, index_options: 'offsets',
|
38
|
-
indexes :email, type: :string, index_options: 'offsets',
|
39
|
-
indexes :time, type: :date
|
40
|
+
indexes :name, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
41
|
+
indexes :email, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
42
|
+
indexes :time, type: :date, format: :basic_date_time_no_millis
|
40
43
|
end
|
41
44
|
|
42
45
|
indexes :commiter do
|
43
|
-
indexes :name, type: :string, index_options: 'offsets',
|
44
|
-
indexes :email, type: :string, index_options: 'offsets',
|
45
|
-
indexes :time, type: :date
|
46
|
+
indexes :name, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
47
|
+
indexes :email, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
48
|
+
indexes :time, type: :date, format: :basic_date_time_no_millis
|
46
49
|
end
|
47
50
|
|
48
|
-
indexes :message, type: :string, index_options: 'offsets',
|
51
|
+
indexes :message, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
49
52
|
end
|
50
53
|
end
|
51
54
|
|
@@ -68,52 +71,52 @@ module Elasticsearch
|
|
68
71
|
|
69
72
|
diff = repository_for_indexing.diff(from, to)
|
70
73
|
|
71
|
-
diff.deltas.reverse.
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
74
|
+
diff.deltas.reverse.each_slice(BLOBS_BATCH) do |slice|
|
75
|
+
bulk_operations = slice.map do |delta|
|
76
|
+
if delta.status == :deleted
|
77
|
+
next if delta.old_file[:mode].to_s(8) == "160000"
|
78
|
+
b = LiteBlob.new(repository_for_indexing, delta.old_file)
|
79
|
+
delete_blob(b)
|
80
|
+
else
|
81
|
+
next if delta.new_file[:mode].to_s(8) == "160000"
|
82
|
+
b = LiteBlob.new(repository_for_indexing, delta.new_file)
|
83
|
+
index_blob(b, to)
|
84
|
+
end
|
80
85
|
end
|
81
86
|
|
82
|
-
|
83
|
-
ObjectSpace.garbage_collect if step % 100 == 0
|
87
|
+
perform_bulk bulk_operations
|
84
88
|
end
|
85
89
|
end
|
86
90
|
|
91
|
+
def perform_bulk(bulk_operations)
|
92
|
+
client_for_indexing.bulk body: bulk_operations.compact
|
93
|
+
end
|
94
|
+
|
95
|
+
|
96
|
+
def delete_blob(blob)
|
97
|
+
return unless blob.text?
|
98
|
+
{ delete: { _index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{blob.path}" } }
|
99
|
+
end
|
100
|
+
|
101
|
+
|
87
102
|
def index_blob(blob, target_sha)
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
content: blob.data,
|
102
|
-
commit_sha: target_sha,
|
103
|
-
path: blob.path,
|
104
|
-
language: blob.language ? blob.language.name : "Text"
|
105
|
-
}
|
103
|
+
return unless can_index_blob?(blob)
|
104
|
+
{
|
105
|
+
index: {
|
106
|
+
_index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{blob.path}",
|
107
|
+
data: {
|
108
|
+
blob: {
|
109
|
+
type: "blob",
|
110
|
+
oid: blob.id,
|
111
|
+
rid: repository_id,
|
112
|
+
content: blob.data,
|
113
|
+
commit_sha: target_sha,
|
114
|
+
path: blob.path,
|
115
|
+
language: blob.language ? blob.language.name : "Text"
|
106
116
|
}
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
sleep 1
|
111
|
-
retry
|
112
|
-
else
|
113
|
-
raise CreateIndexException, "Can't index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
|
114
|
-
end
|
115
|
-
end
|
116
|
-
end
|
117
|
+
}
|
118
|
+
}
|
119
|
+
}
|
117
120
|
end
|
118
121
|
|
119
122
|
# Index text-like files which size less 1.mb
|
@@ -121,21 +124,6 @@ module Elasticsearch
|
|
121
124
|
blob.text? && (blob.size && blob.size.to_i < 1048576)
|
122
125
|
end
|
123
126
|
|
124
|
-
def delete_from_index_blob(blob)
|
125
|
-
if blob.text?
|
126
|
-
begin
|
127
|
-
client_for_indexing.delete \
|
128
|
-
index: "#{self.class.index_name}",
|
129
|
-
type: "repository",
|
130
|
-
id: "#{repository_id}_#{blob.path}"
|
131
|
-
rescue Elasticsearch::Transport::Transport::Errors::NotFound
|
132
|
-
return true
|
133
|
-
rescue Exception => ex
|
134
|
-
raise CreateIndexException, "Error with removing file from index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
|
135
|
-
end
|
136
|
-
end
|
137
|
-
end
|
138
|
-
|
139
127
|
# Indexing all commits in repository
|
140
128
|
#
|
141
129
|
# All data stored in global index
|
@@ -160,50 +148,65 @@ module Elasticsearch
|
|
160
148
|
# For search from commits use type 'commit'
|
161
149
|
def index_commits(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid)
|
162
150
|
from, to = parse_revs(from_rev, to_rev)
|
163
|
-
range = [from, to].
|
151
|
+
range = [from, to].compact.join('..')
|
164
152
|
out, err, status = Open3.capture3("git log #{range} --format=\"%H\"", chdir: repository_for_indexing.path)
|
165
153
|
|
166
154
|
if status.success? && err.blank?
|
155
|
+
queue = Queue.new
|
156
|
+
|
157
|
+
workers = (0...3).map do
|
158
|
+
Thread.new do
|
159
|
+
while bulk_operations = queue.pop
|
160
|
+
perform_bulk bulk_operations
|
161
|
+
end
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
167
165
|
#TODO use rugged walker!!!
|
168
166
|
commit_oids = out.split("\n")
|
169
167
|
|
170
|
-
commit_oids.
|
171
|
-
|
172
|
-
|
168
|
+
commit_oids.each_slice(COMMMITS_BATCH) do |batch|
|
169
|
+
bulk_operations = batch.map do |commit|
|
170
|
+
index_commit(repository_for_indexing.lookup(commit))
|
171
|
+
end
|
172
|
+
|
173
|
+
# perform_bulk bulk_operations
|
174
|
+
queue << bulk_operations
|
175
|
+
end
|
176
|
+
|
177
|
+
while queue.num_waiting < workers.count
|
178
|
+
sleep 0.1
|
173
179
|
end
|
174
|
-
return commit_oids.count
|
175
|
-
end
|
176
180
|
|
177
|
-
|
181
|
+
# Kill off each thread now that they're idle and exit
|
182
|
+
workers.each(&:exit)
|
183
|
+
end
|
178
184
|
end
|
179
185
|
|
180
186
|
def index_commit(commit)
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
index: "#{self.class.index_name}",
|
186
|
-
type: self.class.name.underscore,
|
187
|
-
id: "#{repository_id}_#{commit.oid}",
|
188
|
-
body: {
|
187
|
+
{
|
188
|
+
index: {
|
189
|
+
_index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{commit.oid}",
|
190
|
+
data: {
|
189
191
|
commit: {
|
190
192
|
type: "commit",
|
191
193
|
rid: repository_id,
|
192
194
|
sha: commit.oid,
|
193
|
-
author:
|
194
|
-
|
195
|
+
author: {
|
196
|
+
name: commit.author[:name],
|
197
|
+
email: commit.author[:email],
|
198
|
+
time: commit.author[:time].strftime('%Y%m%dT%H%M%S%z'),
|
199
|
+
},
|
200
|
+
committer: {
|
201
|
+
name: commit.committer[:name],
|
202
|
+
email: commit.committer[:email],
|
203
|
+
time: commit.committer[:time].strftime('%Y%m%dT%H%M%S%z'),
|
204
|
+
},
|
195
205
|
message: encode!(commit.message)
|
196
206
|
}
|
197
207
|
}
|
198
|
-
|
199
|
-
|
200
|
-
tries += 1
|
201
|
-
sleep 1
|
202
|
-
retry
|
203
|
-
else
|
204
|
-
raise CreateIndexException, "Can't index #{repository_id}_#{commit.oid}. Reason: #{ex.message}"
|
205
|
-
end
|
206
|
-
end
|
208
|
+
}
|
209
|
+
}
|
207
210
|
end
|
208
211
|
|
209
212
|
def parse_revs(from_rev, to_rev)
|
@@ -350,7 +353,7 @@ module Elasticsearch
|
|
350
353
|
end
|
351
354
|
|
352
355
|
def client_for_indexing
|
353
|
-
@client_for_indexing ||= Elasticsearch::Client.new
|
356
|
+
@client_for_indexing ||= Elasticsearch::Client.new retry_on_failure: 5
|
354
357
|
end
|
355
358
|
|
356
359
|
def self.search(query, type: :all, page: 1, per: 20, options: {})
|
data/test/test_helper.rb
CHANGED
@@ -4,7 +4,7 @@ require 'pry'
|
|
4
4
|
require 'elasticsearch/git'
|
5
5
|
|
6
6
|
SUPPORT_PATH = File.join(File.expand_path(File.dirname(__FILE__)), '../support')
|
7
|
-
TEST_REPO_PATH = File.join(SUPPORT_PATH, '
|
7
|
+
TEST_REPO_PATH = File.join(SUPPORT_PATH, 'gitlab-test.git')
|
8
8
|
|
9
9
|
require_relative 'support/seed_helper'
|
10
10
|
require_relative 'support/repository'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gitlab-elasticsearch-git
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrey Kumanyaev
|
@@ -10,36 +10,36 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2016-01-
|
13
|
+
date: 2016-01-27 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: elasticsearch-model
|
17
17
|
requirement: !ruby/object:Gem::Requirement
|
18
18
|
requirements:
|
19
|
-
- - "
|
19
|
+
- - "~>"
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: '0'
|
21
|
+
version: '1.0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
25
|
requirements:
|
26
|
-
- - "
|
26
|
+
- - "~>"
|
27
27
|
- !ruby/object:Gem::Version
|
28
|
-
version: '0'
|
28
|
+
version: '1.0'
|
29
29
|
- !ruby/object:Gem::Dependency
|
30
30
|
name: elasticsearch-api
|
31
31
|
requirement: !ruby/object:Gem::Requirement
|
32
32
|
requirements:
|
33
33
|
- - "~>"
|
34
34
|
- !ruby/object:Gem::Version
|
35
|
-
version: 1.0
|
35
|
+
version: '1.0'
|
36
36
|
type: :runtime
|
37
37
|
prerelease: false
|
38
38
|
version_requirements: !ruby/object:Gem::Requirement
|
39
39
|
requirements:
|
40
40
|
- - "~>"
|
41
41
|
- !ruby/object:Gem::Version
|
42
|
-
version: 1.0
|
42
|
+
version: '1.0'
|
43
43
|
- !ruby/object:Gem::Dependency
|
44
44
|
name: rugged
|
45
45
|
requirement: !ruby/object:Gem::Requirement
|
@@ -60,56 +60,56 @@ dependencies:
|
|
60
60
|
requirements:
|
61
61
|
- - "~>"
|
62
62
|
- !ruby/object:Gem::Version
|
63
|
-
version: 0.7
|
63
|
+
version: '0.7'
|
64
64
|
type: :runtime
|
65
65
|
prerelease: false
|
66
66
|
version_requirements: !ruby/object:Gem::Requirement
|
67
67
|
requirements:
|
68
68
|
- - "~>"
|
69
69
|
- !ruby/object:Gem::Version
|
70
|
-
version: 0.7
|
70
|
+
version: '0.7'
|
71
71
|
- !ruby/object:Gem::Dependency
|
72
72
|
name: github-linguist
|
73
73
|
requirement: !ruby/object:Gem::Requirement
|
74
74
|
requirements:
|
75
75
|
- - "~>"
|
76
76
|
- !ruby/object:Gem::Version
|
77
|
-
version: 4.7
|
77
|
+
version: '4.7'
|
78
78
|
type: :runtime
|
79
79
|
prerelease: false
|
80
80
|
version_requirements: !ruby/object:Gem::Requirement
|
81
81
|
requirements:
|
82
82
|
- - "~>"
|
83
83
|
- !ruby/object:Gem::Version
|
84
|
-
version: 4.7
|
84
|
+
version: '4.7'
|
85
85
|
- !ruby/object:Gem::Dependency
|
86
86
|
name: activemodel
|
87
87
|
requirement: !ruby/object:Gem::Requirement
|
88
88
|
requirements:
|
89
89
|
- - "~>"
|
90
90
|
- !ruby/object:Gem::Version
|
91
|
-
version: 4.2
|
91
|
+
version: '4.2'
|
92
92
|
type: :runtime
|
93
93
|
prerelease: false
|
94
94
|
version_requirements: !ruby/object:Gem::Requirement
|
95
95
|
requirements:
|
96
96
|
- - "~>"
|
97
97
|
- !ruby/object:Gem::Version
|
98
|
-
version: 4.2
|
98
|
+
version: '4.2'
|
99
99
|
- !ruby/object:Gem::Dependency
|
100
100
|
name: activesupport
|
101
101
|
requirement: !ruby/object:Gem::Requirement
|
102
102
|
requirements:
|
103
103
|
- - "~>"
|
104
104
|
- !ruby/object:Gem::Version
|
105
|
-
version: 4.2
|
105
|
+
version: '4.2'
|
106
106
|
type: :runtime
|
107
107
|
prerelease: false
|
108
108
|
version_requirements: !ruby/object:Gem::Requirement
|
109
109
|
requirements:
|
110
110
|
- - "~>"
|
111
111
|
- !ruby/object:Gem::Version
|
112
|
-
version: 4.2
|
112
|
+
version: '4.2'
|
113
113
|
description: Elasticsearch integrations for indexing git repositories.
|
114
114
|
email:
|
115
115
|
- me@zzet.org
|
@@ -157,7 +157,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
157
157
|
version: '0'
|
158
158
|
requirements: []
|
159
159
|
rubyforge_project:
|
160
|
-
rubygems_version: 2.4.
|
160
|
+
rubygems_version: 2.4.8
|
161
161
|
signing_key:
|
162
162
|
specification_version: 4
|
163
163
|
summary: Elasticsearch integrations for git repositories.
|