diarize-jruby 0.2.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +101 -0
- data/diarize-jruby.gemspec +16 -0
- data/lib/diarize.rb +22 -0
- data/lib/diarize/LIUM_SpkDiarization-4.2.jar +0 -0
- data/lib/diarize/audio.rb +194 -0
- data/lib/diarize/audio_player.rb +42 -0
- data/lib/diarize/lium.rb +22 -0
- data/lib/diarize/segment.rb +74 -0
- data/lib/diarize/segmentation.rb +55 -0
- data/lib/diarize/speaker.rb +187 -0
- data/lib/diarize/super_vector.rb +86 -0
- data/lib/diarize/ubm.gmm +0 -0
- metadata +95 -0
data/README.md
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
diarize-jruby
|
2
|
+
=============
|
3
|
+
|
4
|
+
This library provides an easy-to-use toolkit for speaker
|
5
|
+
segmentation (diarization) and identification from audio.
|
6
|
+
|
7
|
+
This library is being used within the BBC R&D World Service
|
8
|
+
archive prototype.
|
9
|
+
|
10
|
+
See http://worldservice.prototyping.bbc.co.uk/programmes/X0403940 for
|
11
|
+
an example.
|
12
|
+
|
13
|
+
Speaker diarization
|
14
|
+
-------------------
|
15
|
+
|
16
|
+
This library gives access to the algorithm developed by the LIUM
|
17
|
+
for the ESTER 2 evaluation campaign and described in [Meigner2010].
|
18
|
+
|
19
|
+
It wraps a binary JAR file compiled from
|
20
|
+
http://lium3.univ-lemans.fr/diarization/doku.php/welcome.
|
21
|
+
|
22
|
+
|
23
|
+
Speaker identification
|
24
|
+
----------------------
|
25
|
+
|
26
|
+
This library also implements an algorithm for speaker identification
|
27
|
+
based on the comparison of normalised speaker models, which can be
|
28
|
+
accessed through the Speaker#match method.
|
29
|
+
|
30
|
+
This algorithm builds on top of the LIUM toolkit and uses the following
|
31
|
+
techniques:
|
32
|
+
|
33
|
+
* "M-Norm" normalisation of speaker models [Ben2003]
|
34
|
+
* The symmetric Kullback-Leibler divergence approximation described in [Do2003]
|
35
|
+
* The detection score specified in [Ben2005]
|
36
|
+
|
37
|
+
It also includes support for speaker supervectors [Campbell2006], which
|
38
|
+
can be used in combination with our ruby-lsh library for fast speaker
|
39
|
+
identification.
|
40
|
+
|
41
|
+
Example use
|
42
|
+
-----------
|
43
|
+
|
44
|
+
This gem has been tested with jruby 1.7.2 onwards.
|
45
|
+
|
46
|
+
$ jruby -S gem install diarize-jruby
|
47
|
+
$ jruby -S irb
|
48
|
+
> require 'diarize'
|
49
|
+
> audio = Diarize::Audio.new URI('http://example.com/file.wav')
|
50
|
+
> audio.analyze!
|
51
|
+
> audio.segments
|
52
|
+
> audio.speakers
|
53
|
+
> audio.to_rdf
|
54
|
+
> speakers = audio.speakers
|
55
|
+
> speakers.first.gender
|
56
|
+
> speakers.first.model.mean\_log\_likelihood
|
57
|
+
> speakers.first.model.components.size
|
58
|
+
> audio.segments\_by\_speaker(speakers.first)[0].play
|
59
|
+
> audio.segments\_by\_speaker(speakers.first)[1].play
|
60
|
+
> ...
|
61
|
+
> speakers |= other\_speakers
|
62
|
+
> Diarize::Speaker.match(speakers)
|
63
|
+
|
64
|
+
|
65
|
+
References
|
66
|
+
----------
|
67
|
+
|
68
|
+
[Meigner2010] S. Meignier and T. Merlin, "LIUM SpkDiarization:
|
69
|
+
An Open Source Toolkit For Diarization" in Proc. CMU SPUD Workshop,
|
70
|
+
March 2010, Dallas (Texas, USA)
|
71
|
+
|
72
|
+
[Ben2003] M. Ben and F. Bimbot, "D-MAP: A Distance-Normalized Map
|
73
|
+
Estimation of Speaker Models for Automatic Speaker Verification",
|
74
|
+
Proceedings of ICASSP, 2003
|
75
|
+
|
76
|
+
[Do2003] M. N. Do, "Fast Approximation of Kullback-Leibler Distance
|
77
|
+
for Dependence Trees and Hidden Markov Models",
|
78
|
+
IEEE Signal Processing Letters, April 2003
|
79
|
+
|
80
|
+
[Ben2005] M. Ben and G. Gravier and F. Bimbot. "A model space
|
81
|
+
framework for efficient speaker detection",
|
82
|
+
Proceedings of INTERSPEECH, 2005
|
83
|
+
|
84
|
+
[Campbell2006] W. M. Campbell, D. E. Sturim and D. A. Reynolds,
|
85
|
+
"Support vector machines using GMM supervectors for speaker verification",
|
86
|
+
IEEE Signal Processing Letters, 2006, 13, 308-311
|
87
|
+
|
88
|
+
Licensing terms and authorship
|
89
|
+
------------------------------
|
90
|
+
|
91
|
+
See 'COPYING' and 'AUTHORS' files.
|
92
|
+
|
93
|
+
All code here, except where otherwise indicated, is licensed under
|
94
|
+
the GNU Affero General Public License version 3. This license includes
|
95
|
+
many restrictions. If this causes a problem, please contact us.
|
96
|
+
See "AUTHORS" for contact details.
|
97
|
+
|
98
|
+
This library includes a binary JAR file from the LIUM project, whose code
|
99
|
+
is licensed under the GNU General Public License version 2. See
|
100
|
+
http://lium3.univ-lemans.fr/diarization/doku.php/licence for more
|
101
|
+
information.
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Gem specification for diarize-jruby.
# The gem targets the 'java' platform because it wraps a bundled Java JAR.
Gem::Specification.new do |s|
  s.name = "diarize-jruby"
  s.version = "0.2.0"
  s.date = "2013-06-14"
  s.summary = "Speaker Diarization for JRuby"
  s.email = "yves.raimond@bbc.co.uk"
  s.homepage = "http://github.com/bbcrd/diarize"
  s.description = "A library for JRuby wrapping the LIUM Speaker Diarization and including a few extra tools"
  s.authors = ['Yves Raimond']
  # Only regular files are listed here: a bare directory entry ("lib") is not a
  # packageable file and is rejected by RubyGems' specification validation.
  s.files = ["README.md", "diarize-jruby.gemspec", "lib/diarize.rb", "lib/diarize/LIUM_SpkDiarization-4.2.jar", "lib/diarize/lium.rb", "lib/diarize/audio.rb", "lib/diarize/audio_player.rb", "lib/diarize/segmentation.rb", "lib/diarize/segment.rb", "lib/diarize/ubm.gmm", "lib/diarize/speaker.rb", "lib/diarize/super_vector.rb"]
  s.platform = 'java'
  s.add_dependency 'to-rdf'
  s.add_dependency 'jblas-ruby'
end
|
16
|
+
|
data/lib/diarize.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'lium.rb')
|
18
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'audio.rb')
|
19
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'segment.rb')
|
20
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'segmentation.rb')
|
21
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'audio_player.rb')
|
22
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'super_vector.rb')
|
Binary file
|
@@ -0,0 +1,194 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'lium')
|
18
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'segmentation')
|
19
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'speaker')
|
20
|
+
|
21
|
+
require 'rubygems'
|
22
|
+
require 'to_rdf'
|
23
|
+
require 'uri'
|
24
|
+
|
25
|
+
module Diarize
|
26
|
+
|
27
|
+
class Audio
|
28
|
+
|
29
|
+
attr_reader :path, :file
|
30
|
+
|
31
|
+
# Builds an Audio from a URI.
#
# A 'file' URI is used in place. Any other scheme is downloaded with wget
# into /tmp, under a file name made of the percent-encoded URI.
#
# uri - URI object locating the audio file.
#
# Raises RuntimeError if the download command does not succeed.
def initialize(uri)
  @uri = uri
  if uri.scheme == 'file'
    # Local file
    @path = uri.path
  else
    # Remote file, we get it locally.
    # Every byte outside the URI "unreserved" set is percent-encoded so the
    # whole URI collapses into a single file name. This is the same encoding
    # URI.escape produced with that character class; URI.escape itself was
    # removed in Ruby 3.0, hence the explicit form.
    encoded = uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d]/) do |chars|
      chars.unpack('C*').map { |byte| format('%%%02X', byte) }.join
    end
    @path = '/tmp/' + encoded
    # Argument-list form of system: no shell is involved, so characters in the
    # URI (spaces, ';', '&', quotes...) cannot be interpreted as shell syntax.
    downloaded = Kernel.system('wget', uri.to_s, '-O', @path)
    raise "Could not download #{uri} to #{@path}" unless downloaded
  end
  @file = File.new @path
end
|
43
|
+
|
44
|
+
def analyze!(train_speaker_models = true)
|
45
|
+
parameter = fr.lium.spkDiarization.parameter.Parameter.new
|
46
|
+
parameter.show = show
|
47
|
+
# 12 MFCC + Energy
|
48
|
+
# 1: static coefficients are present in the file
|
49
|
+
# 1: energy coefficient is present in the file
|
50
|
+
# 0: delta coefficients are not present in the file
|
51
|
+
# 0: delta energy coefficient is not present in the file
|
52
|
+
# 0: delta delta coefficients are not present in the file
|
53
|
+
# 0: delta delta energy coefficient is not present in the file
|
54
|
+
# 13: total size of a feature vector in the mfcc file
|
55
|
+
# 0:0:0: no feature normalization
|
56
|
+
parameter.parameterInputFeature.setFeaturesDescription('audio2sphinx,1:1:0:0:0:0,13,0:0:0:0')
|
57
|
+
parameter.parameterDiarization.cEClustering = true # We use CE clustering by default
|
58
|
+
parameter.parameterInputFeature.setFeatureMask(@path)
|
59
|
+
@clusters = ester2(parameter)
|
60
|
+
@segments = Segmentation.from_clusters(self, @clusters)
|
61
|
+
train_speaker_gmms if train_speaker_models
|
62
|
+
end
|
63
|
+
|
64
|
+
# Deletes the local copy of the audio, unless this Audio was built from a
# 'file' URI (in which case the file belongs to the caller and is kept).
#
# The File handle opened by the constructor is closed before the file is
# removed, so the descriptor is not leaked.
def clean!
  return if @uri.scheme == 'file' # Don't delete local file if initialised from local URI
  @file.close if @file && !@file.closed?
  File.delete(@path)
end
|
68
|
+
|
69
|
+
# Returns the segments computed by analyze!.
#
# Raises RuntimeError when no analysis result is available yet. RuntimeError
# is a StandardError, so a plain `rescue` catches it; it is still caught by
# any caller rescuing Exception.
def segments
  raise 'You need to run analyze! before being able to access the analysis results' unless @segments
  @segments
end
|
73
|
+
|
74
|
+
# Distinct speakers found across all segments, in order of first appearance.
# The list is computed on first access and memoised in @speakers.
def speakers
  @speakers ||= segments.map(&:speaker).uniq
end
|
78
|
+
|
79
|
+
def segments_by_speaker(speaker)
|
80
|
+
segments.select { |segment| segment.speaker == speaker }
|
81
|
+
end
|
82
|
+
|
83
|
+
# Total duration of all segments attributed to the given speaker.
#
# speaker - the speaker whose segments are summed.
#
# Returns a Float (0.0 when the speaker has no segment), or nil when no
# speaker is given.
def duration_by_speaker(speaker)
  return unless speaker
  segments_by_speaker(speaker).inject(0.0) { |total, segment| total + segment.duration }
end
|
90
|
+
|
91
|
+
# Speakers ordered by total speaking time, longest first.
#
# sort_by evaluates duration_by_speaker once per speaker; a comparator block
# would recompute it (a full scan of the segments) on every comparison.
def top_speakers
  speakers.sort_by { |speaker| duration_by_speaker(speaker) }.reverse
end
|
94
|
+
|
95
|
+
include ToRdf
|
96
|
+
|
97
|
+
def namespaces
|
98
|
+
super.merge 'ws' => 'http://wsarchive.prototype0.net/ontology/', 'mo' => 'http://purl.org/ontology/mo/'
|
99
|
+
end
|
100
|
+
|
101
|
+
def uri
|
102
|
+
@uri
|
103
|
+
end
|
104
|
+
|
105
|
+
def uri=(uri)
|
106
|
+
@uri = uri
|
107
|
+
end
|
108
|
+
|
109
|
+
def base_uri
|
110
|
+
# Remove the fragment if there is one
|
111
|
+
base = uri.clone
|
112
|
+
base.fragment = nil
|
113
|
+
base
|
114
|
+
end
|
115
|
+
|
116
|
+
def type_uri
|
117
|
+
@type_uri || 'mo:AudioFile'
|
118
|
+
end
|
119
|
+
|
120
|
+
def type_uri=(type_uri)
|
121
|
+
@type_uri = type_uri
|
122
|
+
end
|
123
|
+
|
124
|
+
def rdf_mapping
|
125
|
+
{ 'ws:segment' => segments }
|
126
|
+
end
|
127
|
+
|
128
|
+
# The LIUM show name: the last component of the expanded path, cut at its
# first dot (so neither directory nor extension is kept).
def show
  file_name = File.expand_path(@path).split('/').last
  file_name.split('.').first
end
|
132
|
+
|
133
|
+
protected
|
134
|
+
|
135
|
+
def train_speaker_gmms
|
136
|
+
segments # Making sure we have pre-computed segments and clusters
|
137
|
+
# Would be nice to reuse GMMs computed as part of the segmentation process
|
138
|
+
# but not sure how to access them without changing the Java API
|
139
|
+
|
140
|
+
# Start by copying models from the universal background model, one per speaker, using MTrainInit
|
141
|
+
parameter = fr.lium.spkDiarization.parameter.Parameter.new
|
142
|
+
parameter.parameterInputFeature.setFeaturesDescription('audio2sphinx,1:3:2:0:0:0,13,1:1:300:4')
|
143
|
+
parameter.parameterInputFeature.setFeatureMask(@path)
|
144
|
+
parameter.parameterInitializationEM.setModelInitMethod('copy')
|
145
|
+
parameter.parameterModelSetInputFile.setMask(File.join(File.expand_path(File.dirname(__FILE__)), 'ubm.gmm'))
|
146
|
+
features = fr.lium.spkDiarization.lib.MainTools.readFeatureSet(parameter, @clusters)
|
147
|
+
init_vect = java.util.ArrayList.new(@clusters.cluster_get_size)
|
148
|
+
fr.lium.spkDiarization.programs.MTrainInit.make(features, @clusters, init_vect, parameter)
|
149
|
+
|
150
|
+
# Adapt models to individual speakers detected in the audio, using MTrainMap
|
151
|
+
parameter = fr.lium.spkDiarization.parameter.Parameter.new
|
152
|
+
parameter.parameterInputFeature.setFeaturesDescription('audio2sphinx,1:3:2:0:0:0,13,1:1:300:4')
|
153
|
+
parameter.parameterInputFeature.setFeatureMask(@path)
|
154
|
+
parameter.parameterEM.setEMControl('1,5,0.01')
|
155
|
+
parameter.parameterVarianceControl.setVarianceControl('0.01,10.0')
|
156
|
+
parameter.show = show
|
157
|
+
features.setCurrentShow(parameter.show)
|
158
|
+
gmm_vect = java.util.ArrayList.new
|
159
|
+
fr.lium.spkDiarization.programs.MTrainMAP.make(features, @clusters, init_vect, gmm_vect, parameter)
|
160
|
+
|
161
|
+
# Populating the speakers with their GMMs
|
162
|
+
gmm_vect.each_with_index do |speaker_model, i|
|
163
|
+
speakers[i].model = speaker_model
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
def ester2(parameter)
|
168
|
+
diarization = fr.lium.spkDiarization.system.Diarization.new
|
169
|
+
parameterDiarization = parameter.parameterDiarization
|
170
|
+
clusterSet = diarization.initialize__method(parameter)
|
171
|
+
featureSet = fr.lium.spkDiarization.system.Diarization.load_feature(parameter, clusterSet, parameter.parameterInputFeature.getFeaturesDescString())
|
172
|
+
featureSet.setCurrentShow(parameter.show)
|
173
|
+
nbFeatures = featureSet.getNumberOfFeatures
|
174
|
+
clusterSet.getFirstCluster().firstSegment().setLength(nbFeatures) unless parameter.parameterDiarization.isLoadInputSegmentation
|
175
|
+
clustersSegInit = diarization.sanityCheck(clusterSet, featureSet, parameter)
|
176
|
+
clustersSeg = diarization.segmentation("GLR", "FULL", clustersSegInit, featureSet, parameter)
|
177
|
+
clustersLClust = diarization.clusteringLinear(parameterDiarization.getThreshold("l"), clustersSeg, featureSet, parameter)
|
178
|
+
clustersHClust = diarization.clustering(parameterDiarization.getThreshold("h"), clustersLClust, featureSet, parameter)
|
179
|
+
clustersDClust = diarization.decode(8, parameterDiarization.getThreshold("d"), clustersHClust, featureSet, parameter)
|
180
|
+
clustersSplitClust = diarization.speech("10,10,50", clusterSet, clustersSegInit, clustersDClust, featureSet, parameter)
|
181
|
+
clusters = diarization.gender(clusterSet, clustersSplitClust, featureSet, parameter)
|
182
|
+
if parameter.parameterDiarization.isCEClustering
|
183
|
+
# If true, the program computes the NCLR/CE clustering at the end.
|
184
|
+
# The diarization error rate is minimized.
|
185
|
+
# If this option is not set, the program stops right after the detection of the gender
|
186
|
+
# and the resulting segmentation is sufficient for a transcription system.
|
187
|
+
clusters = diarization.speakerClustering(parameterDiarization.getThreshold("c"), "ce", clusterSet, clusters, featureSet, parameter)
|
188
|
+
end
|
189
|
+
clusters
|
190
|
+
end
|
191
|
+
|
192
|
+
end
|
193
|
+
|
194
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'java'
|
18
|
+
|
19
|
+
module Diarize
|
20
|
+
|
21
|
+
class AudioPlayer
|
22
|
+
|
23
|
+
# Plays part of an audio file through the Java sound API, blocking until the
# requested duration has elapsed or playback is interrupted.
#
# file     - object responding to #path (the audio file to play).
# start    - offset at which playback starts, in seconds (converted to
#            microseconds for the clip).
# duration - how long to keep playing, in seconds (passed to sleep).
#
# The clip and the input stream are released in ensure blocks, so they are
# closed even when opening or starting the clip raises.
def play(file, start=0.0, duration=10.0)
  java_file = java.io.File.new(file.path)
  stream = javax.sound.sampled.AudioSystem.getAudioInputStream(java_file)
  begin
    clip = javax.sound.sampled.AudioSystem.clip
    begin
      clip.open(stream)
      clip.setMicrosecondPosition(start * 1000000)
      clip.start
      begin
        sleep(duration)
      rescue Interrupt
        # Only a user interrupt (e.g. Ctrl-C) stops playback quietly;
        # any other error propagates once the resources are released.
        $stderr.puts 'Stopping playback'
      end
      clip.stop
    ensure
      clip.close
    end
  ensure
    stream.close
  end
end
|
39
|
+
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
data/lib/diarize/lium.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'java'
|
18
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'LIUM_SpkDiarization-4.2.jar')
|
19
|
+
|
20
|
+
def fr
|
21
|
+
Java::Fr
|
22
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'audio_player')
|
18
|
+
|
19
|
+
require 'rubygems'
|
20
|
+
require 'to_rdf'
|
21
|
+
require 'uri'
|
22
|
+
|
23
|
+
module Diarize
|
24
|
+
|
25
|
+
class Segment
|
26
|
+
|
27
|
+
attr_reader :start, :duration, :gender, :bandwidth
|
28
|
+
|
29
|
+
# Builds a segment of an analysed audio file.
#
# audio      - the audio object this segment belongs to.
# start      - segment start.
# duration   - segment duration.
# gender     - gender detected for the segment's speaker.
# bandwidth  - bandwidth detected for the segment.
# speaker_id - identifier of the speaker within the audio.
def initialize(audio, start, duration, gender, bandwidth, speaker_id)
  @audio = audio
  @start = start
  @duration = duration
  # Backs the public `gender` reader (and therefore the 'ws:gender' RDF value),
  # which would otherwise always be nil.
  @gender = gender
  @bandwidth = bandwidth
  @speaker_id = speaker_id
  @speaker_gender = gender
end
|
37
|
+
|
38
|
+
def speaker
|
39
|
+
Speaker.find_or_create(URI("#{@audio.base_uri}##{@speaker_id}"), @speaker_gender)
|
40
|
+
end
|
41
|
+
|
42
|
+
def play
|
43
|
+
player = AudioPlayer.new
|
44
|
+
player.play(@audio.file, start, duration)
|
45
|
+
end
|
46
|
+
|
47
|
+
include ToRdf
|
48
|
+
|
49
|
+
def namespaces
|
50
|
+
super.merge 'ws' => 'http://wsarchive.prototype0.net/ontology/'
|
51
|
+
end
|
52
|
+
|
53
|
+
def uri
|
54
|
+
# http://www.w3.org/TR/media-frags/
|
55
|
+
URI("#{@audio.base_uri}#t=#{start},#{start+duration}")
|
56
|
+
end
|
57
|
+
|
58
|
+
def type_uri
|
59
|
+
'ws:Segment'
|
60
|
+
end
|
61
|
+
|
62
|
+
def rdf_mapping
|
63
|
+
{
|
64
|
+
'ws:start' => start,
|
65
|
+
'ws:duration' => duration,
|
66
|
+
'ws:gender' => gender,
|
67
|
+
'ws:bandwidth' => bandwidth,
|
68
|
+
'ws:speaker' => speaker,
|
69
|
+
}
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'segment')
|
18
|
+
|
19
|
+
module Diarize
|
20
|
+
|
21
|
+
class Segmentation
|
22
|
+
|
23
|
+
# Builds the list of segments described by a LIUM segmentation (.seg) file.
#
# audio    - the audio object the segments belong to.
# seg_file - path of the segmentation file.
#
# Lines starting with ';;' (comments) and blank lines are skipped. Each
# remaining line is split on whitespace into the columns:
#   show, channel, start, length, gender, band, environment, speaker label
# Start and length are divided by 100.0 (presumably feature frames at 100 per
# second - confirm against the LIUM documentation).
#
# Returns an Array of Segment.
def self.from_seg_file(audio, seg_file)
  segmentation = []
  # File.foreach closes the file once iteration ends, even on error.
  File.foreach(seg_file) do |line|
    next if line.start_with?(';;') || line.strip.empty?
    parts = line.split(' ')
    start = parts[2].to_i / 100.0
    duration = parts[3].to_i / 100.0
    gender = parts[4]
    # Band is the 6th column; the 7th column is the environment.
    bandwidth = parts[5]
    speaker_id = parts[7]
    segmentation << Segment.new(audio, start, duration, gender, bandwidth, speaker_id)
  end
  segmentation
end
|
37
|
+
|
38
|
+
# Builds the list of segments held by a set of speaker clusters.
#
# audio    - the audio object the segments belong to.
# clusters - cluster set: iterating it yields speaker ids, and get_cluster
#            returns the cluster for an id.
#
# Every segment inherits the gender and bandwidth of its cluster.
#
# Returns an Array of Segment, cluster by cluster.
def self.from_clusters(audio, clusters)
  result = []
  clusters.each do |speaker_id|
    cluster = clusters.get_cluster(speaker_id)
    gender, bandwidth = cluster.gender, cluster.bandwidth
    cluster.each do |piece|
      result << Segment.new(audio, piece.start_in_second, piece.length_in_second, gender, bandwidth, speaker_id)
    end
  end
  result
end
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
end
|
@@ -0,0 +1,187 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'rubygems'
|
18
|
+
require 'to_rdf'
|
19
|
+
require 'jblas'
|
20
|
+
|
21
|
+
module Diarize
|
22
|
+
|
23
|
+
class Speaker
|
24
|
+
|
25
|
+
include JBLAS
|
26
|
+
|
27
|
+
# Some possible matching heuristics if using GDMAP:
|
28
|
+
# - speaker mean_log_likelihood needs to be more than -33 to be considered for match
|
29
|
+
# - distance between two speakers needs to be less than distance between speaker and universal model + detection threshold to be considered
|
30
|
+
|
31
|
+
@@log_likelihood_threshold = -33
|
32
|
+
@@detection_threshold = 0.2 # Need to learn that parameter
|
33
|
+
|
34
|
+
@@speakers = {}
|
35
|
+
|
36
|
+
attr_accessor :model_uri, :model, :normalized
|
37
|
+
attr_reader :gender
|
38
|
+
|
39
|
+
def initialize(uri = nil, gender = nil, model_file = nil)
|
40
|
+
@model = Speaker.load_model(model_file) if model_file
|
41
|
+
@uri = uri
|
42
|
+
@gender = gender
|
43
|
+
@normalized = false
|
44
|
+
end
|
45
|
+
|
46
|
+
def self.ubm
|
47
|
+
speaker = Speaker.new
|
48
|
+
speaker.normalized = true
|
49
|
+
speaker.model = Speaker.load_model(File.join(File.expand_path(File.dirname(__FILE__)), 'ubm.gmm'))
|
50
|
+
speaker
|
51
|
+
end
|
52
|
+
|
53
|
+
# Mean log-likelihood of the speaker: the value set through
# mean_log_likelihood= when there is one, otherwise the model's own value
# (which will be NaN if the model was loaded from somewhere).
def mean_log_likelihood
  @mean_log_likelihood || model.mean_log_likelihood
end
|
56
|
+
|
57
|
+
def mean_log_likelihood=(mll)
|
58
|
+
@mean_log_likelihood = mll
|
59
|
+
end
|
60
|
+
|
61
|
+
def save_model(filename)
|
62
|
+
# TODO perhaps a warning if a normalised model is being saved?
|
63
|
+
write_gmm(filename, @model)
|
64
|
+
end
|
65
|
+
|
66
|
+
def self.detection_threshold=(threshold)
|
67
|
+
@@detection_threshold = threshold
|
68
|
+
end
|
69
|
+
|
70
|
+
def self.detection_threshold
|
71
|
+
@@detection_threshold
|
72
|
+
end
|
73
|
+
|
74
|
+
def self.load_model(filename)
|
75
|
+
read_gmm(filename)
|
76
|
+
end
|
77
|
+
|
78
|
+
def self.find_or_create(uri, gender)
|
79
|
+
return @@speakers[uri] if @@speakers[uri]
|
80
|
+
@@speakers[uri] = Speaker.new(uri, gender)
|
81
|
+
end
|
82
|
+
|
83
|
+
def self.divergence(speaker1, speaker2)
|
84
|
+
# TODO bundle in mean_log_likelihood to weight down unlikely models?
|
85
|
+
return unless speaker1.model and speaker2.model
|
86
|
+
# MAP Gaussian divergence
|
87
|
+
# See "A model space framework for efficient speaker detection", Interspeech'05
|
88
|
+
divergence_lium(speaker1, speaker2)
|
89
|
+
end
|
90
|
+
|
91
|
+
def self.divergence_lium(speaker1, speaker2)
|
92
|
+
fr.lium.spkDiarization.libModel.Distance.GDMAP(speaker1.model, speaker2.model)
|
93
|
+
end
|
94
|
+
|
95
|
+
def self.divergence_ruby(speaker1, speaker2)
|
96
|
+
SuperVector.divergence(speaker1.supervector, speaker2.supervector)
|
97
|
+
end
|
98
|
+
|
99
|
+
def self.match_sets(speakers1, speakers2)
|
100
|
+
matches = []
|
101
|
+
speakers1.each do |s1|
|
102
|
+
speakers2.each do |s2|
|
103
|
+
matches << [ s1, s2 ] if s1.same_speaker_as(s2)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
matches
|
107
|
+
end
|
108
|
+
|
109
|
+
def self.match(speakers)
|
110
|
+
speakers.combination(2).select { |s1, s2| s1.same_speaker_as(s2) }
|
111
|
+
end
|
112
|
+
|
113
|
+
# Normalises the speaker model in place (at most once per speaker).
#
# Applies M-Norm from "D-MAP: a Distance-Normalized MAP Estimation of Speaker
# Models for Automatic Speaker Verification" to the associated GMM, placing it
# on a unit hyper-sphere with a UBM centre (model will be at distance one from
# the UBM according to GDMAP).
# Using supervectors:
#   vector = (1.0 / distance_to_ubm) * vector + (1.0 - 1.0 / distance_to_ubm) * ubm_vector
#
# Returns true.
def normalize!
  unless @normalized
    speaker_ubm = Speaker.ubm
    distance_to_ubm = Math.sqrt(Speaker.divergence(self, speaker_ubm))
    # A model sitting exactly on the UBM cannot be rescaled: dividing by a zero
    # distance would turn every mean into NaN. Its means are left untouched.
    unless distance_to_ubm.zero?
      scale = 1.0 / distance_to_ubm
      model.nb_of_components.times do |k|
        gaussian = model.components.get(k)
        ubm_gaussian = speaker_ubm.model.components.get(k)
        gaussian.dim.times do |i|
          normalized_mean = scale * gaussian.mean(i) + (1.0 - scale) * ubm_gaussian.mean(i)
          gaussian.set_mean(i, normalized_mean)
        end
      end
      # The means have changed: drop any supervector cached from the
      # un-normalised model so that it is rebuilt on next access.
      @supervector = nil
    end
    @normalized = true
  end
  @normalized
end
|
132
|
+
|
133
|
+
def same_speaker_as(other)
|
134
|
+
# Detection score defined in Ben2005
|
135
|
+
return unless [ self.mean_log_likelihood, other.mean_log_likelihood ].min > @@log_likelihood_threshold
|
136
|
+
self.normalize!
|
137
|
+
other.normalize!
|
138
|
+
detection_score = 1.0 - Speaker.divergence(other, self)
|
139
|
+
detection_score > @@detection_threshold
|
140
|
+
end
|
141
|
+
|
142
|
+
def supervector
|
143
|
+
# TODO: cache only when normalized
|
144
|
+
@supervector ||= SuperVector.generate_from_model(model)
|
145
|
+
end
|
146
|
+
|
147
|
+
include ToRdf
|
148
|
+
|
149
|
+
def namespaces
|
150
|
+
super.merge 'ws' => 'http://wsarchive.prototype0.net/ontology/'
|
151
|
+
end
|
152
|
+
|
153
|
+
def uri
|
154
|
+
@uri
|
155
|
+
end
|
156
|
+
|
157
|
+
def type_uri
|
158
|
+
'ws:Speaker'
|
159
|
+
end
|
160
|
+
|
161
|
+
# RDF property => value mapping used by ToRdf to serialise the speaker.
#
# The mean log-likelihood goes through the mean_log_likelihood reader, so a
# value set with mean_log_likelihood= is honoured instead of always reading
# the model's own value.
def rdf_mapping
  {
    'ws:gender' => gender,
    'ws:model' => model_uri,
    'ws:mean_log_likelihood' => mean_log_likelihood,
    'ws:supervector_hash' => supervector.hash.to_s
  }
end
|
164
|
+
|
165
|
+
protected
|
166
|
+
|
167
|
+
# Reads a GMM container file through the LIUM ModelIO reader.
#
# filename - path of the GMM file.
#
# Returns the first model found in the container (nil if it holds none).
# The input is closed even when reading raises.
def self.read_gmm(filename)
  gmmlist = java.util.ArrayList.new
  input = fr.lium.spkDiarization.lib.IOFile.new(filename, 'rb')
  input.open
  begin
    fr.lium.spkDiarization.libModel.ModelIO.readerGMMContainer(input, gmmlist)
  ensure
    input.close
  end
  gmmlist.to_a.first
end
|
175
|
+
|
176
|
+
# Writes a single model to a GMM container file through the LIUM ModelIO
# writer.
#
# filename - path of the file to write.
# model    - the GMM to store.
#
# The output is closed even when writing raises.
def write_gmm(filename, model)
  gmmlist = java.util.ArrayList.new
  gmmlist << model
  output = fr.lium.spkDiarization.lib.IOFile.new(filename, 'wb')
  output.open
  begin
    fr.lium.spkDiarization.libModel.ModelIO.writerGMMContainer(output, gmmlist)
  ensure
    output.close
  end
end
|
184
|
+
|
185
|
+
end
|
186
|
+
|
187
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
module Diarize
|
18
|
+
|
19
|
+
class SuperVector
|
20
|
+
|
21
|
+
include JBLAS
|
22
|
+
|
23
|
+
attr_reader :vector
|
24
|
+
|
25
|
+
def initialize(vector)
|
26
|
+
@vector = vector
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.generate_from_model(model)
|
30
|
+
# Generates a supervector from a LIUM GMM
|
31
|
+
dim = model.nb_of_components * model.components.get(0).dim
|
32
|
+
vector = DoubleMatrix.new(1, dim)
|
33
|
+
model.nb_of_components.times do |k|
|
34
|
+
gaussian = model.components.get(k)
|
35
|
+
gaussian.dim.times do |i|
|
36
|
+
vector[k * gaussian.dim + i] = gaussian.mean(i)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
SuperVector.new(vector)
|
40
|
+
end
|
41
|
+
|
42
|
+
def self.ubm_gaussian_weights
|
43
|
+
# Returns a vector of gaussian weights, same dimension as speaker's super vectors
|
44
|
+
@@ubm_gaussian_weights ||= (
|
45
|
+
ubm = Speaker.ubm
|
46
|
+
weights = DoubleMatrix.new(1, ubm.supervector.dim)
|
47
|
+
ubm.model.nb_of_components.times do |k|
|
48
|
+
gaussian = ubm.model.components.get(k)
|
49
|
+
gaussian.dim.times do |i|
|
50
|
+
weights[k * gaussian.dim + i] = gaussian.weight
|
51
|
+
end
|
52
|
+
end
|
53
|
+
weights
|
54
|
+
)
|
55
|
+
end
|
56
|
+
|
57
|
+
def self.ubm_covariance
|
58
|
+
# Returns a vector of diagonal covariances, same dimension as speaker's super vectors
|
59
|
+
@@ubm_covariance ||= (
|
60
|
+
ubm = Speaker.ubm
|
61
|
+
cov = DoubleMatrix.new(1, ubm.supervector.dim)
|
62
|
+
ubm.model.nb_of_components.times do |k|
|
63
|
+
gaussian = ubm.model.components.get(k)
|
64
|
+
gaussian.dim.times do |i|
|
65
|
+
cov[k * gaussian.dim + i] = gaussian.getCovariance(i, i)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
cov
|
69
|
+
)
|
70
|
+
end
|
71
|
+
|
72
|
+
def self.divergence(sv1, sv2)
|
73
|
+
ubm_gaussian_weights.mul(((sv1.vector - sv2.vector) ** 2) / ubm_covariance).sum
|
74
|
+
end
|
75
|
+
|
76
|
+
def dim
|
77
|
+
@vector.columns
|
78
|
+
end
|
79
|
+
|
80
|
+
def hash
|
81
|
+
@vector.hash
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
data/lib/diarize/ubm.gmm
ADDED
Binary file
|
metadata
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: diarize-jruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
prerelease:
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- Yves Raimond
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-06-14 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: to-rdf
|
16
|
+
version_requirements: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - ">="
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: !binary |-
|
21
|
+
MA==
|
22
|
+
none: false
|
23
|
+
requirement: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: !binary |-
|
28
|
+
MA==
|
29
|
+
none: false
|
30
|
+
prerelease: false
|
31
|
+
type: :runtime
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: jblas-ruby
|
34
|
+
version_requirements: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: !binary |-
|
39
|
+
MA==
|
40
|
+
none: false
|
41
|
+
requirement: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: !binary |-
|
46
|
+
MA==
|
47
|
+
none: false
|
48
|
+
prerelease: false
|
49
|
+
type: :runtime
|
50
|
+
description: A library for JRuby wrapping the LIUM Speaker Diarization and including a few extra tools
|
51
|
+
email: yves.raimond@bbc.co.uk
|
52
|
+
executables: []
|
53
|
+
extensions: []
|
54
|
+
extra_rdoc_files: []
|
55
|
+
files:
|
56
|
+
- README.md
|
57
|
+
- diarize-jruby.gemspec
|
58
|
+
- lib/diarize.rb
|
59
|
+
- lib/diarize/LIUM_SpkDiarization-4.2.jar
|
60
|
+
- lib/diarize/lium.rb
|
61
|
+
- lib/diarize/audio.rb
|
62
|
+
- lib/diarize/audio_player.rb
|
63
|
+
- lib/diarize/segmentation.rb
|
64
|
+
- lib/diarize/segment.rb
|
65
|
+
- lib/diarize/ubm.gmm
|
66
|
+
- lib/diarize/speaker.rb
|
67
|
+
- lib/diarize/super_vector.rb
|
68
|
+
homepage: http://github.com/bbcrd/diarize
|
69
|
+
licenses: []
|
70
|
+
post_install_message:
|
71
|
+
rdoc_options: []
|
72
|
+
require_paths:
|
73
|
+
- lib
|
74
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
+
requirements:
|
76
|
+
- - ">="
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: !binary |-
|
79
|
+
MA==
|
80
|
+
none: false
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: !binary |-
|
86
|
+
MA==
|
87
|
+
none: false
|
88
|
+
requirements: []
|
89
|
+
rubyforge_project:
|
90
|
+
rubygems_version: 1.8.24
|
91
|
+
signing_key:
|
92
|
+
specification_version: 3
|
93
|
+
summary: Speaker Diarization for JRuby
|
94
|
+
test_files: []
|
95
|
+
has_rdoc: false
|