gitlab-elasticsearch-git 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +12 -4
- data/gitlab-elasticsearch-git.gemspec +6 -6
- data/lib/elasticsearch/git/model.rb +3 -18
- data/lib/elasticsearch/git/repository.rb +98 -95
- data/lib/elasticsearch/git/version.rb +1 -1
- data/test/test_helper.rb +1 -1
- metadata +17 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 407b4f13d2f80ccce7365e649318b811fcf3d164
|
4
|
+
data.tar.gz: 80651810beb864805f008c8ecc7d51201eb6b4fc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 276867d9624cccaba15a97c0847222fe57958394023d5d7afdb52f7c3e9f315770d5f8dac4393f899c8130d8e545ebdaf128f6064503a981722c0951a00a0c8e
|
7
|
+
data.tar.gz: 6ad0348184aa893e97b379e31e6b2b9dcfea7002ddd8bd80e6b2ec211b734a369ecee4149e73c27e536f3666aa6dfe54d3dd993c6f4d9b1ccdbe78a94c4d2d70
|
data/CHANGELOG
CHANGED
@@ -1,9 +1,17 @@
|
|
1
|
+
0.0.8
|
2
|
+
- Using Elastic bulk API
|
3
|
+
- Optimisations of index mappings
|
4
|
+
- Performance optimization
|
5
|
+
|
6
|
+
0.0.7
|
7
|
+
- Refactoring
|
8
|
+
|
1
9
|
0.0.6
|
2
|
-
|
10
|
+
- Support elasticsearch-model gem version 0.1.8
|
3
11
|
|
4
12
|
0.0.5
|
5
|
-
|
6
|
-
|
13
|
+
- Search by file name on master branch
|
14
|
+
- Migrate from elasticsearch 0.9.x -> 1.x
|
7
15
|
|
8
16
|
0.0.4
|
9
|
-
- Stable version
|
17
|
+
- Stable version
|
@@ -18,11 +18,11 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_runtime_dependency 'elasticsearch-model'
|
22
|
-
spec.add_runtime_dependency 'elasticsearch-api', '~> 1.0
|
21
|
+
spec.add_runtime_dependency 'elasticsearch-model', '~> 1.0'
|
22
|
+
spec.add_runtime_dependency 'elasticsearch-api', '~> 1.0'
|
23
23
|
spec.add_runtime_dependency 'rugged', '~> 0.23.3'
|
24
|
-
spec.add_runtime_dependency 'charlock_holmes', '~> 0.7
|
25
|
-
spec.add_runtime_dependency 'github-linguist', '~> 4.7
|
26
|
-
spec.add_runtime_dependency 'activemodel', '~> 4.2
|
27
|
-
spec.add_runtime_dependency 'activesupport', '~> 4.2
|
24
|
+
spec.add_runtime_dependency 'charlock_holmes', '~> 0.7'
|
25
|
+
spec.add_runtime_dependency 'github-linguist', '~> 4.7'
|
26
|
+
spec.add_runtime_dependency 'activemodel', '~> 4.2'
|
27
|
+
spec.add_runtime_dependency 'activesupport', '~> 4.2'
|
28
28
|
end
|
@@ -27,17 +27,17 @@ module Elasticsearch
|
|
27
27
|
human_analyzer: {
|
28
28
|
type: 'custom',
|
29
29
|
tokenizer: 'human_tokenizer',
|
30
|
-
filter: %w(lowercase asciifolding
|
30
|
+
filter: %w(lowercase asciifolding)
|
31
31
|
},
|
32
32
|
path_analyzer: {
|
33
33
|
type: 'custom',
|
34
34
|
tokenizer: 'path_tokenizer',
|
35
|
-
filter: %w(lowercase asciifolding
|
35
|
+
filter: %w(lowercase asciifolding)
|
36
36
|
},
|
37
37
|
sha_analyzer: {
|
38
38
|
type: 'custom',
|
39
39
|
tokenizer: 'sha_tokenizer',
|
40
|
-
filter: %w(lowercase asciifolding
|
40
|
+
filter: %w(lowercase asciifolding)
|
41
41
|
},
|
42
42
|
code_analyzer: {
|
43
43
|
type: 'custom',
|
@@ -64,21 +64,6 @@ module Elasticsearch
|
|
64
64
|
},
|
65
65
|
},
|
66
66
|
filter: {
|
67
|
-
human_ngrams: {
|
68
|
-
type: "nGram",
|
69
|
-
min_gram: 1,
|
70
|
-
max_gram: 20
|
71
|
-
},
|
72
|
-
sha_ngrams: {
|
73
|
-
type: "edgeNGram",
|
74
|
-
min_gram: 8,
|
75
|
-
max_gram: 40
|
76
|
-
},
|
77
|
-
path_ngrams: {
|
78
|
-
type: "edgeNGram",
|
79
|
-
min_gram: 3,
|
80
|
-
max_gram: 15
|
81
|
-
},
|
82
67
|
code_stemmer: {
|
83
68
|
type: "stemmer",
|
84
69
|
name: "minimal_english"
|
@@ -11,6 +11,9 @@ module Elasticsearch
|
|
11
11
|
module Repository
|
12
12
|
class CreateIndexException < StandardError; end
|
13
13
|
|
14
|
+
BLOBS_BATCH = 100
|
15
|
+
COMMMITS_BATCH = 500
|
16
|
+
|
14
17
|
extend ActiveSupport::Concern
|
15
18
|
|
16
19
|
included do
|
@@ -19,33 +22,33 @@ module Elasticsearch
|
|
19
22
|
|
20
23
|
mapping _timestamp: { enabled: true } do
|
21
24
|
indexes :blob do
|
22
|
-
indexes :id, type: :string, index_options: 'offsets',
|
25
|
+
indexes :id, type: :string, index_options: 'offsets', analyzer: :human_analyzer
|
23
26
|
indexes :rid, type: :string, index: :not_analyzed
|
24
|
-
indexes :oid, type: :string, index_options: 'offsets',
|
25
|
-
indexes :commit_sha, type: :string, index_options: 'offsets',
|
26
|
-
indexes :path, type: :string,
|
27
|
-
indexes :content, type: :string, index_options: 'offsets',
|
27
|
+
indexes :oid, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
28
|
+
indexes :commit_sha, type: :string, index_options: 'offsets', analyzer: :sha_analyzer
|
29
|
+
indexes :path, type: :string, analyzer: :path_analyzer
|
30
|
+
indexes :content, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
28
31
|
indexes :language, type: :string, index: :not_analyzed
|
29
32
|
end
|
30
33
|
|
31
34
|
indexes :commit do
|
32
|
-
indexes :id, type: :string, index_options: 'offsets',
|
35
|
+
indexes :id, type: :string, index_options: 'offsets', analyzer: :human_analyzer
|
33
36
|
indexes :rid, type: :string, index: :not_analyzed
|
34
|
-
indexes :sha, type: :string, index_options: 'offsets',
|
37
|
+
indexes :sha, type: :string, index_options: 'offsets', analyzer: :sha_analyzer
|
35
38
|
|
36
39
|
indexes :author do
|
37
|
-
indexes :name, type: :string, index_options: 'offsets',
|
38
|
-
indexes :email, type: :string, index_options: 'offsets',
|
39
|
-
indexes :time, type: :date
|
40
|
+
indexes :name, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
41
|
+
indexes :email, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
42
|
+
indexes :time, type: :date, format: :basic_date_time_no_millis
|
40
43
|
end
|
41
44
|
|
42
45
|
indexes :commiter do
|
43
|
-
indexes :name, type: :string, index_options: 'offsets',
|
44
|
-
indexes :email, type: :string, index_options: 'offsets',
|
45
|
-
indexes :time, type: :date
|
46
|
+
indexes :name, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
47
|
+
indexes :email, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
48
|
+
indexes :time, type: :date, format: :basic_date_time_no_millis
|
46
49
|
end
|
47
50
|
|
48
|
-
indexes :message, type: :string, index_options: 'offsets',
|
51
|
+
indexes :message, type: :string, index_options: 'offsets', analyzer: :code_analyzer
|
49
52
|
end
|
50
53
|
end
|
51
54
|
|
@@ -68,52 +71,52 @@ module Elasticsearch
|
|
68
71
|
|
69
72
|
diff = repository_for_indexing.diff(from, to)
|
70
73
|
|
71
|
-
diff.deltas.reverse.
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
74
|
+
diff.deltas.reverse.each_slice(BLOBS_BATCH) do |slice|
|
75
|
+
bulk_operations = slice.map do |delta|
|
76
|
+
if delta.status == :deleted
|
77
|
+
next if delta.old_file[:mode].to_s(8) == "160000"
|
78
|
+
b = LiteBlob.new(repository_for_indexing, delta.old_file)
|
79
|
+
delete_blob(b)
|
80
|
+
else
|
81
|
+
next if delta.new_file[:mode].to_s(8) == "160000"
|
82
|
+
b = LiteBlob.new(repository_for_indexing, delta.new_file)
|
83
|
+
index_blob(b, to)
|
84
|
+
end
|
80
85
|
end
|
81
86
|
|
82
|
-
|
83
|
-
ObjectSpace.garbage_collect if step % 100 == 0
|
87
|
+
perform_bulk bulk_operations
|
84
88
|
end
|
85
89
|
end
|
86
90
|
|
91
|
+
def perform_bulk(bulk_operations)
|
92
|
+
client_for_indexing.bulk body: bulk_operations.compact
|
93
|
+
end
|
94
|
+
|
95
|
+
|
96
|
+
def delete_blob(blob)
|
97
|
+
return unless blob.text?
|
98
|
+
{ delete: { _index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{blob.path}" } }
|
99
|
+
end
|
100
|
+
|
101
|
+
|
87
102
|
def index_blob(blob, target_sha)
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
content: blob.data,
|
102
|
-
commit_sha: target_sha,
|
103
|
-
path: blob.path,
|
104
|
-
language: blob.language ? blob.language.name : "Text"
|
105
|
-
}
|
103
|
+
return unless can_index_blob?(blob)
|
104
|
+
{
|
105
|
+
index: {
|
106
|
+
_index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{blob.path}",
|
107
|
+
data: {
|
108
|
+
blob: {
|
109
|
+
type: "blob",
|
110
|
+
oid: blob.id,
|
111
|
+
rid: repository_id,
|
112
|
+
content: blob.data,
|
113
|
+
commit_sha: target_sha,
|
114
|
+
path: blob.path,
|
115
|
+
language: blob.language ? blob.language.name : "Text"
|
106
116
|
}
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
sleep 1
|
111
|
-
retry
|
112
|
-
else
|
113
|
-
raise CreateIndexException, "Can't index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
|
114
|
-
end
|
115
|
-
end
|
116
|
-
end
|
117
|
+
}
|
118
|
+
}
|
119
|
+
}
|
117
120
|
end
|
118
121
|
|
119
122
|
# Index text-like files which size less 1.mb
|
@@ -121,21 +124,6 @@ module Elasticsearch
|
|
121
124
|
blob.text? && (blob.size && blob.size.to_i < 1048576)
|
122
125
|
end
|
123
126
|
|
124
|
-
def delete_from_index_blob(blob)
|
125
|
-
if blob.text?
|
126
|
-
begin
|
127
|
-
client_for_indexing.delete \
|
128
|
-
index: "#{self.class.index_name}",
|
129
|
-
type: "repository",
|
130
|
-
id: "#{repository_id}_#{blob.path}"
|
131
|
-
rescue Elasticsearch::Transport::Transport::Errors::NotFound
|
132
|
-
return true
|
133
|
-
rescue Exception => ex
|
134
|
-
raise CreateIndexException, "Error with removing file from index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
|
135
|
-
end
|
136
|
-
end
|
137
|
-
end
|
138
|
-
|
139
127
|
# Indexing all commits in repository
|
140
128
|
#
|
141
129
|
# All data stored in global index
|
@@ -160,50 +148,65 @@ module Elasticsearch
|
|
160
148
|
# For search from commits use type 'commit'
|
161
149
|
def index_commits(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid)
|
162
150
|
from, to = parse_revs(from_rev, to_rev)
|
163
|
-
range = [from, to].
|
151
|
+
range = [from, to].compact.join('..')
|
164
152
|
out, err, status = Open3.capture3("git log #{range} --format=\"%H\"", chdir: repository_for_indexing.path)
|
165
153
|
|
166
154
|
if status.success? && err.blank?
|
155
|
+
queue = Queue.new
|
156
|
+
|
157
|
+
workers = (0...3).map do
|
158
|
+
Thread.new do
|
159
|
+
while bulk_operations = queue.pop
|
160
|
+
perform_bulk bulk_operations
|
161
|
+
end
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
167
165
|
#TODO use rugged walker!!!
|
168
166
|
commit_oids = out.split("\n")
|
169
167
|
|
170
|
-
commit_oids.
|
171
|
-
|
172
|
-
|
168
|
+
commit_oids.each_slice(COMMMITS_BATCH) do |batch|
|
169
|
+
bulk_operations = batch.map do |commit|
|
170
|
+
index_commit(repository_for_indexing.lookup(commit))
|
171
|
+
end
|
172
|
+
|
173
|
+
# perform_bulk bulk_operations
|
174
|
+
queue << bulk_operations
|
175
|
+
end
|
176
|
+
|
177
|
+
while queue.num_waiting < workers.count
|
178
|
+
sleep 0.1
|
173
179
|
end
|
174
|
-
return commit_oids.count
|
175
|
-
end
|
176
180
|
|
177
|
-
|
181
|
+
# Kill off each thread now that they're idle and exit
|
182
|
+
workers.each(&:exit)
|
183
|
+
end
|
178
184
|
end
|
179
185
|
|
180
186
|
def index_commit(commit)
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
index: "#{self.class.index_name}",
|
186
|
-
type: self.class.name.underscore,
|
187
|
-
id: "#{repository_id}_#{commit.oid}",
|
188
|
-
body: {
|
187
|
+
{
|
188
|
+
index: {
|
189
|
+
_index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{commit.oid}",
|
190
|
+
data: {
|
189
191
|
commit: {
|
190
192
|
type: "commit",
|
191
193
|
rid: repository_id,
|
192
194
|
sha: commit.oid,
|
193
|
-
author:
|
194
|
-
|
195
|
+
author: {
|
196
|
+
name: commit.author[:name],
|
197
|
+
email: commit.author[:email],
|
198
|
+
time: commit.author[:time].strftime('%Y%m%dT%H%M%S%z'),
|
199
|
+
},
|
200
|
+
committer: {
|
201
|
+
name: commit.committer[:name],
|
202
|
+
email: commit.committer[:email],
|
203
|
+
time: commit.committer[:time].strftime('%Y%m%dT%H%M%S%z'),
|
204
|
+
},
|
195
205
|
message: encode!(commit.message)
|
196
206
|
}
|
197
207
|
}
|
198
|
-
|
199
|
-
|
200
|
-
tries += 1
|
201
|
-
sleep 1
|
202
|
-
retry
|
203
|
-
else
|
204
|
-
raise CreateIndexException, "Can't index #{repository_id}_#{commit.oid}. Reason: #{ex.message}"
|
205
|
-
end
|
206
|
-
end
|
208
|
+
}
|
209
|
+
}
|
207
210
|
end
|
208
211
|
|
209
212
|
def parse_revs(from_rev, to_rev)
|
@@ -350,7 +353,7 @@ module Elasticsearch
|
|
350
353
|
end
|
351
354
|
|
352
355
|
def client_for_indexing
|
353
|
-
@client_for_indexing ||= Elasticsearch::Client.new
|
356
|
+
@client_for_indexing ||= Elasticsearch::Client.new retry_on_failure: 5
|
354
357
|
end
|
355
358
|
|
356
359
|
def self.search(query, type: :all, page: 1, per: 20, options: {})
|
data/test/test_helper.rb
CHANGED
@@ -4,7 +4,7 @@ require 'pry'
|
|
4
4
|
require 'elasticsearch/git'
|
5
5
|
|
6
6
|
SUPPORT_PATH = File.join(File.expand_path(File.dirname(__FILE__)), '../support')
|
7
|
-
TEST_REPO_PATH = File.join(SUPPORT_PATH, '
|
7
|
+
TEST_REPO_PATH = File.join(SUPPORT_PATH, 'gitlab-test.git')
|
8
8
|
|
9
9
|
require_relative 'support/seed_helper'
|
10
10
|
require_relative 'support/repository'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gitlab-elasticsearch-git
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrey Kumanyaev
|
@@ -10,36 +10,36 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2016-01-
|
13
|
+
date: 2016-01-27 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: elasticsearch-model
|
17
17
|
requirement: !ruby/object:Gem::Requirement
|
18
18
|
requirements:
|
19
|
-
- - "
|
19
|
+
- - "~>"
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: '0'
|
21
|
+
version: '1.0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
25
|
requirements:
|
26
|
-
- - "
|
26
|
+
- - "~>"
|
27
27
|
- !ruby/object:Gem::Version
|
28
|
-
version: '0'
|
28
|
+
version: '1.0'
|
29
29
|
- !ruby/object:Gem::Dependency
|
30
30
|
name: elasticsearch-api
|
31
31
|
requirement: !ruby/object:Gem::Requirement
|
32
32
|
requirements:
|
33
33
|
- - "~>"
|
34
34
|
- !ruby/object:Gem::Version
|
35
|
-
version: 1.0
|
35
|
+
version: '1.0'
|
36
36
|
type: :runtime
|
37
37
|
prerelease: false
|
38
38
|
version_requirements: !ruby/object:Gem::Requirement
|
39
39
|
requirements:
|
40
40
|
- - "~>"
|
41
41
|
- !ruby/object:Gem::Version
|
42
|
-
version: 1.0
|
42
|
+
version: '1.0'
|
43
43
|
- !ruby/object:Gem::Dependency
|
44
44
|
name: rugged
|
45
45
|
requirement: !ruby/object:Gem::Requirement
|
@@ -60,56 +60,56 @@ dependencies:
|
|
60
60
|
requirements:
|
61
61
|
- - "~>"
|
62
62
|
- !ruby/object:Gem::Version
|
63
|
-
version: 0.7
|
63
|
+
version: '0.7'
|
64
64
|
type: :runtime
|
65
65
|
prerelease: false
|
66
66
|
version_requirements: !ruby/object:Gem::Requirement
|
67
67
|
requirements:
|
68
68
|
- - "~>"
|
69
69
|
- !ruby/object:Gem::Version
|
70
|
-
version: 0.7
|
70
|
+
version: '0.7'
|
71
71
|
- !ruby/object:Gem::Dependency
|
72
72
|
name: github-linguist
|
73
73
|
requirement: !ruby/object:Gem::Requirement
|
74
74
|
requirements:
|
75
75
|
- - "~>"
|
76
76
|
- !ruby/object:Gem::Version
|
77
|
-
version: 4.7
|
77
|
+
version: '4.7'
|
78
78
|
type: :runtime
|
79
79
|
prerelease: false
|
80
80
|
version_requirements: !ruby/object:Gem::Requirement
|
81
81
|
requirements:
|
82
82
|
- - "~>"
|
83
83
|
- !ruby/object:Gem::Version
|
84
|
-
version: 4.7
|
84
|
+
version: '4.7'
|
85
85
|
- !ruby/object:Gem::Dependency
|
86
86
|
name: activemodel
|
87
87
|
requirement: !ruby/object:Gem::Requirement
|
88
88
|
requirements:
|
89
89
|
- - "~>"
|
90
90
|
- !ruby/object:Gem::Version
|
91
|
-
version: 4.2
|
91
|
+
version: '4.2'
|
92
92
|
type: :runtime
|
93
93
|
prerelease: false
|
94
94
|
version_requirements: !ruby/object:Gem::Requirement
|
95
95
|
requirements:
|
96
96
|
- - "~>"
|
97
97
|
- !ruby/object:Gem::Version
|
98
|
-
version: 4.2
|
98
|
+
version: '4.2'
|
99
99
|
- !ruby/object:Gem::Dependency
|
100
100
|
name: activesupport
|
101
101
|
requirement: !ruby/object:Gem::Requirement
|
102
102
|
requirements:
|
103
103
|
- - "~>"
|
104
104
|
- !ruby/object:Gem::Version
|
105
|
-
version: 4.2
|
105
|
+
version: '4.2'
|
106
106
|
type: :runtime
|
107
107
|
prerelease: false
|
108
108
|
version_requirements: !ruby/object:Gem::Requirement
|
109
109
|
requirements:
|
110
110
|
- - "~>"
|
111
111
|
- !ruby/object:Gem::Version
|
112
|
-
version: 4.2
|
112
|
+
version: '4.2'
|
113
113
|
description: Elasticsearch integrations for indexing git repositories.
|
114
114
|
email:
|
115
115
|
- me@zzet.org
|
@@ -157,7 +157,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
157
157
|
version: '0'
|
158
158
|
requirements: []
|
159
159
|
rubyforge_project:
|
160
|
-
rubygems_version: 2.4.
|
160
|
+
rubygems_version: 2.4.8
|
161
161
|
signing_key:
|
162
162
|
specification_version: 4
|
163
163
|
summary: Elasticsearch integrations for git repositories.
|