gitlab-elasticsearch-git 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: a57ef9fc7d82e26ebcd5a4480c08828a1bb1bfb2
-   data.tar.gz: 25631d2855e4077afe625cbfef70bf920af38eeb
+   metadata.gz: 407b4f13d2f80ccce7365e649318b811fcf3d164
+   data.tar.gz: 80651810beb864805f008c8ecc7d51201eb6b4fc
  SHA512:
-   metadata.gz: 50924522805888f3991965ac6369f1ea8222cf8e87e8ae98110dcd49350c4f767ac03c953045b98e687a25412a831143cbaff55934dc83a879718a6dd952a967
-   data.tar.gz: 6b80258d5e1bad79e9f28066fcc8bd32e406d256d2b10c11f0437cf8b699de6e4a8237dba64f5289e86daa8b398aefd07b059f8e4be0ba6e06ebc6dff05fade9
+   metadata.gz: 276867d9624cccaba15a97c0847222fe57958394023d5d7afdb52f7c3e9f315770d5f8dac4393f899c8130d8e545ebdaf128f6064503a981722c0951a00a0c8e
+   data.tar.gz: 6ad0348184aa893e97b379e31e6b2b9dcfea7002ddd8bd80e6b2ec211b734a369ecee4149e73c27e536f3666aa6dfe54d3dd993c6f4d9b1ccdbe78a94c4d2d70
data/CHANGELOG CHANGED
@@ -1,9 +1,17 @@
+ 0.0.8
+ - Using Elastic bulk API
+ - Optimisations of index mappings
+ - Performance optimization
+
+ 0.0.7
+ - Refactoring
+
  0.0.6
- - Support elasticsearch-model gem version 0.1.8
+ - Support elasticsearch-model gem version 0.1.8

  0.0.5
- - Search by file name on master branch
- - Migrate from elasticsearch 0.9.x -> 1.x
+ - Search by file name on master branch
+ - Migrate from elasticsearch 0.9.x -> 1.x

  0.0.4
- - Stable version
+ - Stable version
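Note on the "Using Elastic bulk API" entry above: with the elasticsearch Ruby client of that era, a bulk request is a single call whose body is an array of operation hashes, with the document payload nested under :data. A minimal, illustrative sketch — the index name, type, and ids below are invented for the example, not taken from the gem:

  require 'elasticsearch'

  client = Elasticsearch::Client.new

  # One HTTP request carries many operations; each hash is keyed by its action.
  operations = [
    { index:  { _index: 'repository', _type: 'repository', _id: 'demo_1', data: { content: 'puts 1' } } },
    { delete: { _index: 'repository', _type: 'repository', _id: 'demo_2' } }
  ]

  client.bulk body: operations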
@@ -18,11 +18,11 @@ Gem::Specification.new do |spec|
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

- spec.add_runtime_dependency 'elasticsearch-model'
- spec.add_runtime_dependency 'elasticsearch-api', '~> 1.0.15'
+ spec.add_runtime_dependency 'elasticsearch-model', '~> 1.0'
+ spec.add_runtime_dependency 'elasticsearch-api', '~> 1.0'
  spec.add_runtime_dependency 'rugged', '~> 0.23.3'
- spec.add_runtime_dependency 'charlock_holmes', '~> 0.7.3'
- spec.add_runtime_dependency 'github-linguist', '~> 4.7.0'
- spec.add_runtime_dependency 'activemodel', '~> 4.2.0'
- spec.add_runtime_dependency 'activesupport', '~> 4.2.0'
+ spec.add_runtime_dependency 'charlock_holmes', '~> 0.7'
+ spec.add_runtime_dependency 'github-linguist', '~> 4.7'
+ spec.add_runtime_dependency 'activemodel', '~> 4.2'
+ spec.add_runtime_dependency 'activesupport', '~> 4.2'
  end
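The gemspec change above loosens several pessimistic version constraints (for example '~> 4.2.0' becomes '~> 4.2'). The '~>' operator keys off the last segment given, so the shorter form admits more releases; a quick check with the stock RubyGems classes, shown only to illustrate the difference:

  require 'rubygems'

  # '~> 4.2.0' means >= 4.2.0 and < 4.3; '~> 4.2' means >= 4.2 and < 5.0.
  Gem::Requirement.new('~> 4.2.0').satisfied_by?(Gem::Version.new('4.3.1')) # => false
  Gem::Requirement.new('~> 4.2').satisfied_by?(Gem::Version.new('4.3.1'))   # => true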
@@ -27,17 +27,17 @@ module Elasticsearch
        human_analyzer: {
          type: 'custom',
          tokenizer: 'human_tokenizer',
-         filter: %w(lowercase asciifolding human_ngrams)
+         filter: %w(lowercase asciifolding)
        },
        path_analyzer: {
          type: 'custom',
          tokenizer: 'path_tokenizer',
-         filter: %w(lowercase asciifolding path_ngrams)
+         filter: %w(lowercase asciifolding)
        },
        sha_analyzer: {
          type: 'custom',
          tokenizer: 'sha_tokenizer',
-         filter: %w(lowercase asciifolding sha_ngrams)
+         filter: %w(lowercase asciifolding)
        },
        code_analyzer: {
          type: 'custom',
@@ -64,21 +64,6 @@ module Elasticsearch
          },
        },
        filter: {
-         human_ngrams: {
-           type: "nGram",
-           min_gram: 1,
-           max_gram: 20
-         },
-         sha_ngrams: {
-           type: "edgeNGram",
-           min_gram: 8,
-           max_gram: 40
-         },
-         path_ngrams: {
-           type: "edgeNGram",
-           min_gram: 3,
-           max_gram: 15
-         },
          code_stemmer: {
            type: "stemmer",
            name: "minimal_english"
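The filters dropped above are the n-gram ones, which is where the "optimisations of index mappings" claim in the CHANGELOG comes from. A rough, standalone illustration (not code from the gem) of the cost they carried, counting how many index terms a single word expands into under the old human_ngrams settings (min_gram 1, max_gram 20):

  # Hypothetical helper, only to show the term blow-up an nGram filter causes.
  def ngrams(term, min_gram, max_gram)
    (min_gram..max_gram).flat_map do |n|
      (0..term.length - n).map { |i| term[i, n] }
    end
  end

  puts ngrams("repository", 1, 20).size # => 55 terms for one 10-character token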
@@ -11,6 +11,9 @@ module Elasticsearch
    module Repository
      class CreateIndexException < StandardError; end

+     BLOBS_BATCH = 100
+     COMMMITS_BATCH = 500
+
      extend ActiveSupport::Concern

      included do
@@ -19,33 +22,33 @@ module Elasticsearch

        mapping _timestamp: { enabled: true } do
          indexes :blob do
-           indexes :id, type: :string, index_options: 'offsets', search_analyzer: :human_analyzer, analyzer: :human_analyzer
+           indexes :id, type: :string, index_options: 'offsets', analyzer: :human_analyzer
            indexes :rid, type: :string, index: :not_analyzed
-           indexes :oid, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
-           indexes :commit_sha, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
-           indexes :path, type: :string, search_analyzer: :path_analyzer, analyzer: :path_analyzer
-           indexes :content, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
+           indexes :oid, type: :string, index_options: 'offsets', analyzer: :code_analyzer
+           indexes :commit_sha, type: :string, index_options: 'offsets', analyzer: :sha_analyzer
+           indexes :path, type: :string, analyzer: :path_analyzer
+           indexes :content, type: :string, index_options: 'offsets', analyzer: :code_analyzer
            indexes :language, type: :string, index: :not_analyzed
          end

          indexes :commit do
-           indexes :id, type: :string, index_options: 'offsets', search_analyzer: :human_analyzer, analyzer: :human_analyzer
+           indexes :id, type: :string, index_options: 'offsets', analyzer: :human_analyzer
            indexes :rid, type: :string, index: :not_analyzed
-           indexes :sha, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
+           indexes :sha, type: :string, index_options: 'offsets', analyzer: :sha_analyzer

            indexes :author do
-             indexes :name, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
-             indexes :email, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
-             indexes :time, type: :date
+             indexes :name, type: :string, index_options: 'offsets', analyzer: :code_analyzer
+             indexes :email, type: :string, index_options: 'offsets', analyzer: :code_analyzer
+             indexes :time, type: :date, format: :basic_date_time_no_millis
            end

            indexes :commiter do
-             indexes :name, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
-             indexes :email, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
-             indexes :time, type: :date
+             indexes :name, type: :string, index_options: 'offsets', analyzer: :code_analyzer
+             indexes :email, type: :string, index_options: 'offsets', analyzer: :code_analyzer
+             indexes :time, type: :date, format: :basic_date_time_no_millis
            end

-           indexes :message, type: :string, index_options: 'offsets', search_analyzer: :code_analyzer, analyzer: :code_analyzer
+           indexes :message, type: :string, index_options: 'offsets', analyzer: :code_analyzer
          end
        end

@@ -68,52 +71,52 @@ module Elasticsearch

        diff = repository_for_indexing.diff(from, to)

-       diff.deltas.reverse.each_with_index do |delta, step|
-         if delta.status == :deleted
-           next if delta.old_file[:mode].to_s(8) == "160000"
-           b = LiteBlob.new(repository_for_indexing, delta.old_file)
-           delete_from_index_blob(b)
-         else
-           next if delta.new_file[:mode].to_s(8) == "160000"
-           b = LiteBlob.new(repository_for_indexing, delta.new_file)
-           index_blob(b, to)
+       diff.deltas.reverse.each_slice(BLOBS_BATCH) do |slice|
+         bulk_operations = slice.map do |delta|
+           if delta.status == :deleted
+             next if delta.old_file[:mode].to_s(8) == "160000"
+             b = LiteBlob.new(repository_for_indexing, delta.old_file)
+             delete_blob(b)
+           else
+             next if delta.new_file[:mode].to_s(8) == "160000"
+             b = LiteBlob.new(repository_for_indexing, delta.new_file)
+             index_blob(b, to)
+           end
          end

-         # Run GC every 100 blobs
-         ObjectSpace.garbage_collect if step % 100 == 0
+         perform_bulk bulk_operations
        end
      end

+     def perform_bulk(bulk_operations)
+       client_for_indexing.bulk body: bulk_operations.compact
+     end
+
+
+     def delete_blob(blob)
+       return unless blob.text?
+       { delete: { _index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{blob.path}" } }
+     end
+
+
      def index_blob(blob, target_sha)
-       if can_index_blob?(blob)
-         tries = 0
-
-         begin
-           client_for_indexing.index \
-             index: "#{self.class.index_name}",
-             type: self.class.name.underscore,
-             id: "#{repository_id}_#{blob.path}",
-             body: {
-               blob: {
-                 type: "blob",
-                 oid: blob.id,
-                 rid: repository_id,
-                 content: blob.data,
-                 commit_sha: target_sha,
-                 path: blob.path,
-                 language: blob.language ? blob.language.name : "Text"
-               }
+       return unless can_index_blob?(blob)
+       {
+         index: {
+           _index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{blob.path}",
+           data: {
+             blob: {
+               type: "blob",
+               oid: blob.id,
+               rid: repository_id,
+               content: blob.data,
+               commit_sha: target_sha,
+               path: blob.path,
+               language: blob.language ? blob.language.name : "Text"
              }
-       rescue Exception => ex
-         if tries < 2
-           tries += 1
-           sleep 1
-           retry
-         else
-           raise CreateIndexException, "Can't index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
-         end
-       end
-     end
+           }
+         }
+       }
      end

      # Index text-like files which size less 1.mb
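One detail of the new bulk path worth spelling out: inside the slice.map above, `next` produces a nil entry for each skipped submodule, which is why perform_bulk compacts the array before posting it. A tiny standalone reproduction of that pattern (the ids are invented):

  ops = [1, nil, 3].map do |id|
    next if id.nil?              # skipped entries yield nil
    { index: { _id: id } }
  end

  ops.compact # => [{ index: { _id: 1 } }, { index: { _id: 3 } }]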
@@ -121,21 +124,6 @@ module Elasticsearch
        blob.text? && (blob.size && blob.size.to_i < 1048576)
      end

-     def delete_from_index_blob(blob)
-       if blob.text?
-         begin
-           client_for_indexing.delete \
-             index: "#{self.class.index_name}",
-             type: "repository",
-             id: "#{repository_id}_#{blob.path}"
-         rescue Elasticsearch::Transport::Transport::Errors::NotFound
-           return true
-         rescue Exception => ex
-           raise CreateIndexException, "Error with removing file from index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
-         end
-       end
-     end
-
      # Indexing all commits in repository
      #
      # All data stored in global index
@@ -160,50 +148,65 @@ module Elasticsearch
      # For search from commits use type 'commit'
      def index_commits(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid)
        from, to = parse_revs(from_rev, to_rev)
-       range = [from, to].reject(&:nil?).join('..')
+       range = [from, to].compact.join('..')
        out, err, status = Open3.capture3("git log #{range} --format=\"%H\"", chdir: repository_for_indexing.path)

        if status.success? && err.blank?
+         queue = Queue.new
+
+         workers = (0...3).map do
+           Thread.new do
+             while bulk_operations = queue.pop
+               perform_bulk bulk_operations
+             end
+           end
+         end
+
          #TODO use rugged walker!!!
          commit_oids = out.split("\n")

-         commit_oids.each_with_index do |commit, step|
-           index_commit(repository_for_indexing.lookup(commit))
-           ObjectSpace.garbage_collect if step % 100 == 0
+         commit_oids.each_slice(COMMMITS_BATCH) do |batch|
+           bulk_operations = batch.map do |commit|
+             index_commit(repository_for_indexing.lookup(commit))
+           end
+
+           # perform_bulk bulk_operations
+           queue << bulk_operations
+         end
+
+         while queue.num_waiting < workers.count
+           sleep 0.1
          end
-         return commit_oids.count
-       end

-       0
+         # Kill off each thread now that they're idle and exit
+         workers.each(&:exit)
+       end
      end

      def index_commit(commit)
-       tries = 0
-
-       begin
-         client_for_indexing.index \
-           index: "#{self.class.index_name}",
-           type: self.class.name.underscore,
-           id: "#{repository_id}_#{commit.oid}",
-           body: {
+       {
+         index: {
+           _index: "#{self.class.index_name}", _type: self.class.name.underscore, _id: "#{repository_id}_#{commit.oid}",
+           data: {
             commit: {
               type: "commit",
               rid: repository_id,
               sha: commit.oid,
-             author: commit.author,
-             committer: commit.committer,
+             author: {
+               name: commit.author[:name],
+               email: commit.author[:email],
+               time: commit.author[:time].strftime('%Y%m%dT%H%M%S%z'),
+             },
+             committer: {
+               name: commit.committer[:name],
+               email: commit.committer[:email],
+               time: commit.committer[:time].strftime('%Y%m%dT%H%M%S%z'),
+             },
              message: encode!(commit.message)
            }
          }
-     rescue Exception => ex
-       if tries < 2
-         tries += 1
-         sleep 1
-         retry
-       else
-         raise CreateIndexException, "Can't index #{repository_id}_#{commit.oid}. Reason: #{ex.message}"
-       end
-     end
+         }
+       }
      end

      def parse_revs(from_rev, to_rev)
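Two notes on the hunk above. The author/committer times are serialised with strftime('%Y%m%dT%H%M%S%z'), which lines up with the basic_date_time_no_millis format now declared on the :time fields in the mapping. And the commit batches are drained by a small pool of worker threads fed through a Queue, with queue.num_waiting polled as the "all done" signal before the workers are killed. A self-contained sketch of that producer/consumer shape — the batch size, item list, and puts output are invented for the example:

  queue = Queue.new

  workers = 3.times.map do
    Thread.new do
      while (batch = queue.pop)
        batch.each { |item| puts "indexing #{item}" }
      end
    end
  end

  (1..10).each_slice(4) { |batch| queue << batch }

  # A worker only shows up in num_waiting once the queue is empty and it blocks
  # in pop again, so waiting for all of them means every batch was processed.
  sleep 0.1 while queue.num_waiting < workers.size
  workers.each(&:exit)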
@@ -350,7 +353,7 @@ module Elasticsearch
      end

      def client_for_indexing
-       @client_for_indexing ||= Elasticsearch::Client.new log: true
+       @client_for_indexing ||= Elasticsearch::Client.new retry_on_failure: 5
      end

      def self.search(query, type: :all, page: 1, per: 20, options: {})
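Swapping log: true for retry_on_failure: 5 pushes retries down into the client's transport layer, which is what allows the hand-rolled tries/sleep/retry loops removed earlier to go away. A minimal sketch of that client setup, assuming a default localhost endpoint:

  require 'elasticsearch'

  # The transport retries a failed request up to five times before raising.
  client = Elasticsearch::Client.new(retry_on_failure: 5)
  client.ping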
@@ -1,5 +1,5 @@
  module Elasticsearch
    module Git
-     VERSION = "0.0.7"
+     VERSION = "0.0.8"
    end
  end
data/test/test_helper.rb CHANGED
@@ -4,7 +4,7 @@ require 'pry'
  require 'elasticsearch/git'

  SUPPORT_PATH = File.join(File.expand_path(File.dirname(__FILE__)), '../support')
- TEST_REPO_PATH = File.join(SUPPORT_PATH, 'testme.git')
+ TEST_REPO_PATH = File.join(SUPPORT_PATH, 'gitlab-test.git')

  require_relative 'support/seed_helper'
  require_relative 'support/repository'
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: gitlab-elasticsearch-git
  version: !ruby/object:Gem::Version
-   version: 0.0.7
+   version: 0.0.8
  platform: ruby
  authors:
  - Andrey Kumanyaev
@@ -10,36 +10,36 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-01-17 00:00:00.000000000 Z
+ date: 2016-01-27 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: elasticsearch-model
    requirement: !ruby/object:Gem::Requirement
      requirements:
-     - - ">="
+     - - "~>"
        - !ruby/object:Gem::Version
-         version: '0'
+         version: '1.0'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
-     - - ">="
+     - - "~>"
        - !ruby/object:Gem::Version
-         version: '0'
+         version: '1.0'
  - !ruby/object:Gem::Dependency
    name: elasticsearch-api
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 1.0.15
+         version: '1.0'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 1.0.15
+         version: '1.0'
  - !ruby/object:Gem::Dependency
    name: rugged
    requirement: !ruby/object:Gem::Requirement
@@ -60,56 +60,56 @@ dependencies:
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 0.7.3
+         version: '0.7'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 0.7.3
+         version: '0.7'
  - !ruby/object:Gem::Dependency
    name: github-linguist
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 4.7.0
+         version: '4.7'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 4.7.0
+         version: '4.7'
  - !ruby/object:Gem::Dependency
    name: activemodel
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 4.2.0
+         version: '4.2'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 4.2.0
+         version: '4.2'
  - !ruby/object:Gem::Dependency
    name: activesupport
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 4.2.0
+         version: '4.2'
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 4.2.0
+         version: '4.2'
  description: Elasticsearch integrations for indexing git repositories.
  email:
  - me@zzet.org
@@ -157,7 +157,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
      version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.4.3
+ rubygems_version: 2.4.8
  signing_key:
  specification_version: 4
  summary: Elasticsearch integrations for git repositories.