gitlab-elasticsearch-git 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a57ef9fc7d82e26ebcd5a4480c08828a1bb1bfb2
4
- data.tar.gz: 25631d2855e4077afe625cbfef70bf920af38eeb
3
+ metadata.gz: 407b4f13d2f80ccce7365e649318b811fcf3d164
4
+ data.tar.gz: 80651810beb864805f008c8ecc7d51201eb6b4fc
5
5
  SHA512:
6
- metadata.gz: 50924522805888f3991965ac6369f1ea8222cf8e87e8ae98110dcd49350c4f767ac03c953045b98e687a25412a831143cbaff55934dc83a879718a6dd952a967
7
- data.tar.gz: 6b80258d5e1bad79e9f28066fcc8bd32e406d256d2b10c11f0437cf8b699de6e4a8237dba64f5289e86daa8b398aefd07b059f8e4be0ba6e06ebc6dff05fade9
6
+ metadata.gz: 276867d9624cccaba15a97c0847222fe57958394023d5d7afdb52f7c3e9f315770d5f8dac4393f899c8130d8e545ebdaf128f6064503a981722c0951a00a0c8e
7
+ data.tar.gz: 6ad0348184aa893e97b379e31e6b2b9dcfea7002ddd8bd80e6b2ec211b734a369ecee4149e73c27e536f3666aa6dfe54d3dd993c6f4d9b1ccdbe78a94c4d2d70
data/CHANGELOG CHANGED
@@ -1,9 +1,17 @@
1
+ 0.0.8
2
+ - Using Elastic bulk API
3
+ - Optimisations of index mappings
4
+ - Performance optimization
5
+
6
+ 0.0.7
7
+ - Refactoring
8
+
1
9
  0.0.6
2
- - Support elasticsearch-model gem version 0.1.8
10
+ - Support elasticsearch-model gem version 0.1.8
3
11
 
4
12
  0.0.5
5
- - Search by file name on master branch
6
- - Migrate from elasticsearch 0.9.x -> 1.x
13
+ - Search by file name on master branch
14
+ - Migrate from elasticsearch 0.9.x -> 1.x
7
15
 
8
16
  0.0.4
9
- - Stable version
17
+ - Stable version
@@ -18,11 +18,11 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_runtime_dependency 'elasticsearch-model'
22
- spec.add_runtime_dependency 'elasticsearch-api', '~> 1.0.15'
21
+ spec.add_runtime_dependency 'elasticsearch-model', '~> 1.0'
22
+ spec.add_runtime_dependency 'elasticsearch-api', '~> 1.0'
23
23
  spec.add_runtime_dependency 'rugged', '~> 0.23.3'
24
- spec.add_runtime_dependency 'charlock_holmes', '~> 0.7.3'
25
- spec.add_runtime_dependency 'github-linguist', '~> 4.7.0'
26
- spec.add_runtime_dependency 'activemodel', '~> 4.2.0'
27
- spec.add_runtime_dependency 'activesupport', '~> 4.2.0'
24
+ spec.add_runtime_dependency 'charlock_holmes', '~> 0.7'
25
+ spec.add_runtime_dependency 'github-linguist', '~> 4.7'
26
+ spec.add_runtime_dependency 'activemodel', '~> 4.2'
27
+ spec.add_runtime_dependency 'activesupport', '~> 4.2'
28
28
  end
@@ -27,17 +27,17 @@ module Elasticsearch
27
27
  human_analyzer: {
28
28
  type: 'custom',
29
29
  tokenizer: 'human_tokenizer',
30
- filter: %w(lowercase asciifolding human_ngrams)
30
+ filter: %w(lowercase asciifolding)
31
31
  },
32
32
  path_analyzer: {
33
33
  type: 'custom',
34
34
  tokenizer: 'path_tokenizer',
35
- filter: %w(lowercase asciifolding path_ngrams)
35
+ filter: %w(lowercase asciifolding)
36
36
  },
37
37
  sha_analyzer: {
38
38
  type: 'custom',
39
39
  tokenizer: 'sha_tokenizer',
40
- filter: %w(lowercase asciifolding sha_ngrams)
40
+ filter: %w(lowercase asciifolding)
41
41
  },
42
42
  code_analyzer: {
43
43
  type: 'custom',
@@ -64,21 +64,6 @@ module Elasticsearch
64
64
  },
65
65
  },
66
66
  filter: {
67
- human_ngrams: {
68
- type: "nGram",
69
- min_gram: 1,
70
- max_gram: 20
71
- },
72
- sha_ngrams: {
73
- type: "edgeNGram",
74
- min_gram: 8,
75
- max_gram: 40
76
- },
77
- path_ngrams: {
78
- type: "edgeNGram",
79
- min_gram: 3,
80
- max_gram: 15
81
- },
82
67
  code_stemmer: {
83
68
  type: "stemmer",
84
69
  name: "minimal_english"
@@ -11,6 +11,9 @@ module Elasticsearch
11
11
  module Repository
12
12
  class CreateIndexException < StandardError; end
13
13
 
14
+ BLOBS_BATCH = 100
15
+ COMMMITS_BATCH = 500
16
+
14
17
  extend ActiveSupport::Concern
15
18
 
16
19
  included do
@@ -19,33 +22,33 @@ module Elasticsearch
19
22
 
20
23
  mapping _timestamp: { enabled: true } do
21
24
  indexes :blob do
22
- indexes :id, type: :string, index_options: 'offsets', search_analyzer: :human_analyzer, analyzer: :human_analyzer
25
+ indexes :id, type: :string, index_options: 'offsets', analyzer: :human_analyzer
23
26
  indexes :rid, type: :string, index: :not_analyzed
24
- indexes :oid, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
25
- indexes :commit_sha, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
26
- indexes :path, type: :string, search_analyzer: :path_analyzer, analyzer: :path_analyzer
27
- indexes :content, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
27
+ indexes :oid, type: :string, index_options: 'offsets', analyzer: :code_analyzer
28
+ indexes :commit_sha, type: :string, index_options: 'offsets', analyzer: :sha_analyzer
29
+ indexes :path, type: :string, analyzer: :path_analyzer
30
+ indexes :content, type: :string, index_options: 'offsets', analyzer: :code_analyzer
28
31
  indexes :language, type: :string, index: :not_analyzed
29
32
  end
30
33
 
31
34
  indexes :commit do
32
- indexes :id, type: :string, index_options: 'offsets', search_analyzer: :human_analyzer, analyzer: :human_analyzer
35
+ indexes :id, type: :string, index_options: 'offsets', analyzer: :human_analyzer
33
36
  indexes :rid, type: :string, index: :not_analyzed
34
- indexes :sha, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
37
+ indexes :sha, type: :string, index_options: 'offsets', analyzer: :sha_analyzer
35
38
 
36
39
  indexes :author do
37
- indexes :name, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
38
- indexes :email, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
39
- indexes :time, type: :date
40
+ indexes :name, type: :string, index_options: 'offsets', analyzer: :code_analyzer
41
+ indexes :email, type: :string, index_options: 'offsets', analyzer: :code_analyzer
42
+ indexes :time, type: :date, format: :basic_date_time_no_millis
40
43
  end
41
44
 
42
45
  indexes :commiter do
43
- indexes :name, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
44
- indexes :email, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
45
- indexes :time, type: :date
46
+ indexes :name, type: :string, index_options: 'offsets', analyzer: :code_analyzer
47
+ indexes :email, type: :string, index_options: 'offsets', analyzer: :code_analyzer
48
+ indexes :time, type: :date, format: :basic_date_time_no_millis
46
49
  end
47
50
 
48
- indexes :message, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
51
+ indexes :message, type: :string, index_options: 'offsets', analyzer: :code_analyzer
49
52
  end
50
53
  end
51
54
 
@@ -68,52 +71,52 @@ module Elasticsearch
68
71
 
69
72
  diff = repository_for_indexing.diff(from, to)
70
73
 
71
- diff.deltas.reverse.each_with_index do |delta, step|
72
- if delta.status == :deleted
73
- next if delta.old_file[:mode].to_s(8) == "160000"
74
- b = LiteBlob.new(repository_for_indexing, delta.old_file)
75
- delete_from_index_blob(b)
76
- else
77
- next if delta.new_file[:mode].to_s(8) == "160000"
78
- b = LiteBlob.new(repository_for_indexing, delta.new_file)
79
- index_blob(b, to)
74
+ diff.deltas.reverse.each_slice(BLOBS_BATCH) do |slice|
75
+ bulk_operations = slice.map do |delta|
76
+ if delta.status == :deleted
77
+ next if delta.old_file[:mode].to_s(8) == "160000"
78
+ b = LiteBlob.new(repository_for_indexing, delta.old_file)
79
+ delete_blob(b)
80
+ else
81
+ next if delta.new_file[:mode].to_s(8) == "160000"
82
+ b = LiteBlob.new(repository_for_indexing, delta.new_file)
83
+ index_blob(b, to)
84
+ end
80
85
  end
81
86
 
82
- # Run GC every 100 blobs
83
- ObjectSpace.garbage_collect if step % 100 == 0
87
+ perform_bulk bulk_operations
84
88
  end
85
89
  end
86
90
 
91
+ def perform_bulk(bulk_operations)
92
+ client_for_indexing.bulk body: bulk_operations.compact
93
+ end
94
+
95
+
96
+ def delete_blob(blob)
97
+ return unless blob.text?
98
+ { delete: { _index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{blob.path}" } }
99
+ end
100
+
101
+
87
102
  def index_blob(blob, target_sha)
88
- if can_index_blob?(blob)
89
- tries = 0
90
-
91
- begin
92
- client_for_indexing.index \
93
- index: "#{self.class.index_name}",
94
- type: self.class.name.underscore,
95
- id: "#{repository_id}_#{blob.path}",
96
- body: {
97
- blob: {
98
- type: "blob",
99
- oid: blob.id,
100
- rid: repository_id,
101
- content: blob.data,
102
- commit_sha: target_sha,
103
- path: blob.path,
104
- language: blob.language ? blob.language.name : "Text"
105
- }
103
+ return unless can_index_blob?(blob)
104
+ {
105
+ index: {
106
+ _index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{blob.path}",
107
+ data: {
108
+ blob: {
109
+ type: "blob",
110
+ oid: blob.id,
111
+ rid: repository_id,
112
+ content: blob.data,
113
+ commit_sha: target_sha,
114
+ path: blob.path,
115
+ language: blob.language ? blob.language.name : "Text"
106
116
  }
107
- rescue Exception => ex
108
- if tries < 2
109
- tries += 1
110
- sleep 1
111
- retry
112
- else
113
- raise CreateIndexException, "Can't index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
114
- end
115
- end
116
- end
117
+ }
118
+ }
119
+ }
117
120
  end
118
121
 
119
122
  # Index text-like files which size less 1.mb
@@ -121,21 +124,6 @@ module Elasticsearch
121
124
  blob.text? && (blob.size && blob.size.to_i < 1048576)
122
125
  end
123
126
 
124
- def delete_from_index_blob(blob)
125
- if blob.text?
126
- begin
127
- client_for_indexing.delete \
128
- index: "#{self.class.index_name}",
129
- type: "repository",
130
- id: "#{repository_id}_#{blob.path}"
131
- rescue Elasticsearch::Transport::Transport::Errors::NotFound
132
- return true
133
- rescue Exception => ex
134
- raise CreateIndexException, "Error with removing file from index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
135
- end
136
- end
137
- end
138
-
139
127
  # Indexing all commits in repository
140
128
  #
141
129
  # All data stored in global index
@@ -160,50 +148,65 @@ module Elasticsearch
160
148
  # For search from commits use type 'commit'
161
149
  def index_commits(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid)
162
150
  from, to = parse_revs(from_rev, to_rev)
163
- range = [from, to].reject(&:nil?).join('..')
151
+ range = [from, to].compact.join('..')
164
152
  out, err, status = Open3.capture3("git log #{range} --format=\"%H\"", chdir: repository_for_indexing.path)
165
153
 
166
154
  if status.success? && err.blank?
155
+ queue = Queue.new
156
+
157
+ workers = (0...3).map do
158
+ Thread.new do
159
+ while bulk_operations = queue.pop
160
+ perform_bulk bulk_operations
161
+ end
162
+ end
163
+ end
164
+
167
165
  #TODO use rugged walker!!!
168
166
  commit_oids = out.split("\n")
169
167
 
170
- commit_oids.each_with_index do |commit, step|
171
- index_commit(repository_for_indexing.lookup(commit))
172
- ObjectSpace.garbage_collect if step % 100 == 0
168
+ commit_oids.each_slice(COMMMITS_BATCH) do |batch|
169
+ bulk_operations = batch.map do |commit|
170
+ index_commit(repository_for_indexing.lookup(commit))
171
+ end
172
+
173
+ # perform_bulk bulk_operations
174
+ queue << bulk_operations
175
+ end
176
+
177
+ while queue.num_waiting < workers.count
178
+ sleep 0.1
173
179
  end
174
- return commit_oids.count
175
- end
176
180
 
177
- 0
181
+ # Kill off each thread now that they're idle and exit
182
+ workers.each(&:exit)
183
+ end
178
184
  end
179
185
 
180
186
  def index_commit(commit)
181
- tries = 0
182
-
183
- begin
184
- client_for_indexing.index \
185
- index: "#{self.class.index_name}",
186
- type: self.class.name.underscore,
187
- id: "#{repository_id}_#{commit.oid}",
188
- body: {
187
+ {
188
+ index: {
189
+ _index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{commit.oid}",
190
+ data: {
189
191
  commit: {
190
192
  type: "commit",
191
193
  rid: repository_id,
192
194
  sha: commit.oid,
193
- author: commit.author,
194
- committer: commit.committer,
195
+ author: {
196
+ name: commit.author[:name],
197
+ email: commit.author[:email],
198
+ time: commit.author[:time].strftime('%Y%m%dT%H%M%S%z'),
199
+ },
200
+ committer: {
201
+ name: commit.committer[:name],
202
+ email: commit.committer[:email],
203
+ time: commit.committer[:time].strftime('%Y%m%dT%H%M%S%z'),
204
+ },
195
205
  message: encode!(commit.message)
196
206
  }
197
207
  }
198
- rescue Exception => ex
199
- if tries < 2
200
- tries += 1
201
- sleep 1
202
- retry
203
- else
204
- raise CreateIndexException, "Can't index #{repository_id}_#{commit.oid}. Reason: #{ex.message}"
205
- end
206
- end
208
+ }
209
+ }
207
210
  end
208
211
 
209
212
  def parse_revs(from_rev, to_rev)
@@ -350,7 +353,7 @@ module Elasticsearch
350
353
  end
351
354
 
352
355
  def client_for_indexing
353
- @client_for_indexing ||= Elasticsearch::Client.new log: true
356
+ @client_for_indexing ||= Elasticsearch::Client.new retry_on_failure: 5
354
357
  end
355
358
 
356
359
  def self.search(query, type: :all, page: 1, per: 20, options: {})
@@ -1,5 +1,5 @@
1
1
  module Elasticsearch
2
2
  module Git
3
- VERSION = "0.0.7"
3
+ VERSION = "0.0.8"
4
4
  end
5
5
  end
data/test/test_helper.rb CHANGED
@@ -4,7 +4,7 @@ require 'pry'
4
4
  require 'elasticsearch/git'
5
5
 
6
6
  SUPPORT_PATH = File.join(File.expand_path(File.dirname(__FILE__)), '../support')
7
- TEST_REPO_PATH = File.join(SUPPORT_PATH, 'testme.git')
7
+ TEST_REPO_PATH = File.join(SUPPORT_PATH, 'gitlab-test.git')
8
8
 
9
9
  require_relative 'support/seed_helper'
10
10
  require_relative 'support/repository'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gitlab-elasticsearch-git
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrey Kumanyaev
@@ -10,36 +10,36 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2016-01-17 00:00:00.000000000 Z
13
+ date: 2016-01-27 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: elasticsearch-model
17
17
  requirement: !ruby/object:Gem::Requirement
18
18
  requirements:
19
- - - ">="
19
+ - - "~>"
20
20
  - !ruby/object:Gem::Version
21
- version: '0'
21
+ version: '1.0'
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
25
25
  requirements:
26
- - - ">="
26
+ - - "~>"
27
27
  - !ruby/object:Gem::Version
28
- version: '0'
28
+ version: '1.0'
29
29
  - !ruby/object:Gem::Dependency
30
30
  name: elasticsearch-api
31
31
  requirement: !ruby/object:Gem::Requirement
32
32
  requirements:
33
33
  - - "~>"
34
34
  - !ruby/object:Gem::Version
35
- version: 1.0.15
35
+ version: '1.0'
36
36
  type: :runtime
37
37
  prerelease: false
38
38
  version_requirements: !ruby/object:Gem::Requirement
39
39
  requirements:
40
40
  - - "~>"
41
41
  - !ruby/object:Gem::Version
42
- version: 1.0.15
42
+ version: '1.0'
43
43
  - !ruby/object:Gem::Dependency
44
44
  name: rugged
45
45
  requirement: !ruby/object:Gem::Requirement
@@ -60,56 +60,56 @@ dependencies:
60
60
  requirements:
61
61
  - - "~>"
62
62
  - !ruby/object:Gem::Version
63
- version: 0.7.3
63
+ version: '0.7'
64
64
  type: :runtime
65
65
  prerelease: false
66
66
  version_requirements: !ruby/object:Gem::Requirement
67
67
  requirements:
68
68
  - - "~>"
69
69
  - !ruby/object:Gem::Version
70
- version: 0.7.3
70
+ version: '0.7'
71
71
  - !ruby/object:Gem::Dependency
72
72
  name: github-linguist
73
73
  requirement: !ruby/object:Gem::Requirement
74
74
  requirements:
75
75
  - - "~>"
76
76
  - !ruby/object:Gem::Version
77
- version: 4.7.0
77
+ version: '4.7'
78
78
  type: :runtime
79
79
  prerelease: false
80
80
  version_requirements: !ruby/object:Gem::Requirement
81
81
  requirements:
82
82
  - - "~>"
83
83
  - !ruby/object:Gem::Version
84
- version: 4.7.0
84
+ version: '4.7'
85
85
  - !ruby/object:Gem::Dependency
86
86
  name: activemodel
87
87
  requirement: !ruby/object:Gem::Requirement
88
88
  requirements:
89
89
  - - "~>"
90
90
  - !ruby/object:Gem::Version
91
- version: 4.2.0
91
+ version: '4.2'
92
92
  type: :runtime
93
93
  prerelease: false
94
94
  version_requirements: !ruby/object:Gem::Requirement
95
95
  requirements:
96
96
  - - "~>"
97
97
  - !ruby/object:Gem::Version
98
- version: 4.2.0
98
+ version: '4.2'
99
99
  - !ruby/object:Gem::Dependency
100
100
  name: activesupport
101
101
  requirement: !ruby/object:Gem::Requirement
102
102
  requirements:
103
103
  - - "~>"
104
104
  - !ruby/object:Gem::Version
105
- version: 4.2.0
105
+ version: '4.2'
106
106
  type: :runtime
107
107
  prerelease: false
108
108
  version_requirements: !ruby/object:Gem::Requirement
109
109
  requirements:
110
110
  - - "~>"
111
111
  - !ruby/object:Gem::Version
112
- version: 4.2.0
112
+ version: '4.2'
113
113
  description: Elasticsearch integrations for indexing git repositories.
114
114
  email:
115
115
  - me@zzet.org
@@ -157,7 +157,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
157
157
  version: '0'
158
158
  requirements: []
159
159
  rubyforge_project:
160
- rubygems_version: 2.4.3
160
+ rubygems_version: 2.4.8
161
161
  signing_key:
162
162
  specification_version: 4
163
163
  summary: Elasticsearch integrations for git repositories.