diarize-jruby 0.2.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +101 -0
- data/diarize-jruby.gemspec +16 -0
- data/lib/diarize.rb +22 -0
- data/lib/diarize/LIUM_SpkDiarization-4.2.jar +0 -0
- data/lib/diarize/audio.rb +194 -0
- data/lib/diarize/audio_player.rb +42 -0
- data/lib/diarize/lium.rb +22 -0
- data/lib/diarize/segment.rb +74 -0
- data/lib/diarize/segmentation.rb +55 -0
- data/lib/diarize/speaker.rb +187 -0
- data/lib/diarize/super_vector.rb +86 -0
- data/lib/diarize/ubm.gmm +0 -0
- metadata +95 -0
data/README.md
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
diarize-jruby
|
2
|
+
=============
|
3
|
+
|
4
|
+
This library provides an easy-to-use toolkit for speaker
|
5
|
+
segmentation (diarization) and identification from audio.
|
6
|
+
|
7
|
+
This library is being used within the BBC R&D World Service
|
8
|
+
archive prototype.
|
9
|
+
|
10
|
+
See http://worldservice.prototyping.bbc.co.uk/programmes/X0403940 for
|
11
|
+
an example.
|
12
|
+
|
13
|
+
Speaker diarization
|
14
|
+
-------------------
|
15
|
+
|
16
|
+
This library gives access to the algorithm developed by the LIUM
|
17
|
+
for the ESTER 2 evaluation campaign and described in [Meigner2010].
|
18
|
+
|
19
|
+
It wraps a binary JAR file compiled from
|
20
|
+
http://lium3.univ-lemans.fr/diarization/doku.php/welcome.
|
21
|
+
|
22
|
+
|
23
|
+
Speaker identification
|
24
|
+
----------------------
|
25
|
+
|
26
|
+
This library also implements an algorithm for speaker identification
|
27
|
+
based on the comparison of normalised speaker models, which can be
|
28
|
+
accessed through the Speaker#match method.
|
29
|
+
|
30
|
+
This algorithm builds on top of the LIUM toolkit and uses the following
|
31
|
+
techniques:
|
32
|
+
|
33
|
+
* "M-Norm" normalisation of speaker models [Ben2003]
|
34
|
+
* The symmetric Kullback-Leibler divergence approximation described in [Do2003]
|
35
|
+
* The detection score specified in [Ben2005]
|
36
|
+
|
37
|
+
It also includes support for speaker supervectors [Campbell2006], which
|
38
|
+
can be used in combination with our ruby-lsh library for fast speaker
|
39
|
+
identification.
|
40
|
+
|
41
|
+
Example use
|
42
|
+
-----------
|
43
|
+
|
44
|
+
This gem has been tested with jruby 1.7.2 onwards.
|
45
|
+
|
46
|
+
$ jruby -S gem install diarize-jruby
|
47
|
+
$ jruby -S irb
|
48
|
+
> require 'diarize'
|
49
|
+
> audio = Diarize::Audio.new URI('http://example.com/file.wav')
|
50
|
+
> audio.analyze!
|
51
|
+
> audio.segments
|
52
|
+
> audio.speakers
|
53
|
+
> audio.to_rdf
|
54
|
+
> speakers = audio.speakers
|
55
|
+
> speakers.first.gender
|
56
|
+
> speakers.first.model.mean\_log\_likelihood
|
57
|
+
> speakers.first.model.components.size
|
58
|
+
> audio.segments\_by\_speaker(speakers.first)[0].play
|
59
|
+
> audio.segments\_by\_speaker(speakers.first)[1].play
|
60
|
+
> ...
|
61
|
+
> speakers |= other\_speakers
|
62
|
+
> Diarize::Speaker.match(speakers)
|
63
|
+
|
64
|
+
|
65
|
+
References
|
66
|
+
----------
|
67
|
+
|
68
|
+
[Meigner2010] S. Meignier and T. Merlin, "LIUM SpkDiarization:
|
69
|
+
An Open Source Toolkit For Diarization" in Proc. CMU SPUD Workshop,
|
70
|
+
March 2010, Dallas (Texas, USA)
|
71
|
+
|
72
|
+
[Ben2003] M. Ben and F. Bimbot, "D-MAP: A Distance-Normalized MAP
|
73
|
+
Estimation of Speaker Models for Automatic Speaker Verification",
|
74
|
+
Proceedings of ICASSP, 2003
|
75
|
+
|
76
|
+
[Do2003] M. N. Do, "Fast Approximation of Kullback-Leibler Distance
|
77
|
+
for Dependence Trees and Hidden Markov Models",
|
78
|
+
IEEE Signal Processing Letters, April 2003
|
79
|
+
|
80
|
+
[Ben2005] M. Ben and G. Gravier and F. Bimbot. "A model space
|
81
|
+
framework for efficient speaker detection",
|
82
|
+
Proceedings of INTERSPEECH, 2005
|
83
|
+
|
84
|
+
[Campbell2006] W. M. Campbell, D. E. Sturim and D. A. Reynolds,
|
85
|
+
"Support vector machines using GMM supervectors for speaker verification",
|
86
|
+
IEEE Signal Processing Letters, 2006, 13, 308-311
|
87
|
+
|
88
|
+
Licensing terms and authorship
|
89
|
+
------------------------------
|
90
|
+
|
91
|
+
See 'COPYING' and 'AUTHORS' files.
|
92
|
+
|
93
|
+
All code here, except where otherwise indicated, is licensed under
|
94
|
+
the GNU Affero General Public License version 3. This license includes
|
95
|
+
many restrictions. If this causes a problem, please contact us.
|
96
|
+
See "AUTHORS" for contact details.
|
97
|
+
|
98
|
+
This library includes a binary JAR file from the LIUM project, whose code
|
99
|
+
is licensed under the GNU General Public License version 2. See
|
100
|
+
http://lium3.univ-lemans.fr/diarization/doku.php/licence for more
|
101
|
+
information.
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Gem specification for diarize-jruby, a JRuby-only (platform 'java') wrapper
# around the LIUM speaker diarization JAR that ships inside the gem.
Gem::Specification.new do |s|
  s.name = "diarize-jruby"
  s.version = "0.2.0"
  s.date = "2013-06-14"
  s.summary = "Speaker Diarization for JRuby"
  s.email = "yves.raimond@bbc.co.uk"
  s.homepage = "http://github.com/bbcrd/diarize"
  s.description = "A library for JRuby wrapping the LIUM Speaker Diarization and including a few extra tools"
  s.authors = ['Yves Raimond']
  # The README and the source headers place the Ruby code under the GNU
  # Affero General Public License version 3. (The bundled LIUM JAR has its
  # own license, described in the README.)
  s.licenses = ['AGPL-3.0']
  # Only files are listed: the bare "lib" directory entry was dropped, as
  # directories are not packaged. The deprecated, ignored `has_rdoc`
  # setting was dropped as well.
  s.files = ["README.md", "diarize-jruby.gemspec", "lib/diarize.rb", "lib/diarize/LIUM_SpkDiarization-4.2.jar", "lib/diarize/lium.rb", "lib/diarize/audio.rb", "lib/diarize/audio_player.rb", "lib/diarize/segmentation.rb", "lib/diarize/segment.rb", "lib/diarize/ubm.gmm", "lib/diarize/speaker.rb", "lib/diarize/super_vector.rb"]
  s.platform = 'java'
  s.add_dependency 'to-rdf'
  s.add_dependency 'jblas-ruby'
end
|
16
|
+
|
data/lib/diarize.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'lium.rb')
|
18
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'audio.rb')
|
19
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'segment.rb')
|
20
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'segmentation.rb')
|
21
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'audio_player.rb')
|
22
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'diarize', 'super_vector.rb')
|
Binary file
|
@@ -0,0 +1,194 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'lium')
|
18
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'segmentation')
|
19
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'speaker')
|
20
|
+
|
21
|
+
require 'rubygems'
|
22
|
+
require 'to_rdf'
|
23
|
+
require 'uri'
|
24
|
+
|
25
|
+
module Diarize
|
26
|
+
|
27
|
+
# An audio file (local or remote) together with the results of running the
# LIUM diarization pipeline over it: segments, speakers and, optionally, one
# GMM per detected speaker.
#
# Typical use, as shown in the README:
#
#   audio = Diarize::Audio.new URI('http://example.com/file.wav')
#   audio.analyze!
#   audio.segments
#   audio.speakers
class Audio

  attr_reader :path, :file

  # uri - a URI object. With the 'file' scheme the file is used in place;
  #       any other scheme is downloaded with wget into /tmp first.
  #
  # Raises RuntimeError when the download fails.
  def initialize(uri)
    @uri = uri
    if uri.scheme == 'file'
      # Local file
      @path = uri.path
    else
      # Remote file, we get it locally
      @path = '/tmp/' + local_file_name(uri)
      # Argument-list form of system: the URI is handed to wget directly and
      # never goes through a shell, so shell metacharacters in it are inert.
      downloaded = Kernel.system('wget', uri.to_s, '-O', @path)
      raise "Could not download #{uri} with wget" unless downloaded
    end
    @file = File.new @path
  end

  # Runs the diarization and stores the resulting clusters and segments.
  #
  # train_speaker_models - when true (default), also trains one GMM per
  #                        detected speaker (see train_speaker_gmms).
  def analyze!(train_speaker_models = true)
    parameter = fr.lium.spkDiarization.parameter.Parameter.new
    parameter.show = show
    # 12 MFCC + Energy
    # 1: static coefficients are present in the file
    # 1: energy coefficient is present in the file
    # 0: delta coefficients are not present in the file
    # 0: delta energy coefficient is not present in the file
    # 0: delta delta coefficients are not present in the file
    # 0: delta delta energy coefficient is not present in the file
    # 13: total size of a feature vector in the mfcc file
    # 0:0:0: no feature normalization
    parameter.parameterInputFeature.setFeaturesDescription('audio2sphinx,1:1:0:0:0:0,13,0:0:0:0')
    parameter.parameterDiarization.cEClustering = true # We use CE clustering by default
    parameter.parameterInputFeature.setFeatureMask(@path)
    @clusters = ester2(parameter)
    @segments = Segmentation.from_clusters(self, @clusters)
    # Drop the speaker list memoized from a previous run so that it is
    # rebuilt from the new segments.
    @speakers = nil
    train_speaker_gmms if train_speaker_models
  end

  # Deletes the downloaded copy of a remote file. Does nothing when the
  # audio was initialised from a local 'file' URI.
  def clean!
    return if @uri.scheme == 'file' # Don't delete local file if initialised from local URI
    @file.close unless @file.closed?
    File.delete(@path) if File.exist?(@path)
  end

  # Returns the segments computed by analyze!.
  #
  # Raises RuntimeError if analyze! has not been run yet.
  def segments
    raise 'You need to run analyze! before being able to access the analysis results' unless @segments
    @segments
  end

  # Returns the distinct speakers of all segments (memoized until the next
  # analyze!).
  def speakers
    @speakers ||= segments.map { |segment| segment.speaker }.uniq
  end

  # Returns the segments attributed to the given speaker.
  def segments_by_speaker(speaker)
    segments.select { |segment| segment.speaker == speaker }
  end

  # Returns the summed duration of the speaker's segments, or nil when no
  # speaker is given.
  def duration_by_speaker(speaker)
    return unless speaker
    segments_by_speaker(speaker).inject(0.0) { |total, segment| total + segment.duration }
  end

  # Returns the speakers ordered by decreasing total speaking time.
  def top_speakers
    # sort_by computes each speaker's duration once instead of once per
    # comparison.
    speakers.sort_by { |speaker| duration_by_speaker(speaker) }.reverse
  end

  include ToRdf

  attr_accessor :uri
  attr_writer :type_uri

  def namespaces
    super.merge 'ws' => 'http://wsarchive.prototype0.net/ontology/', 'mo' => 'http://purl.org/ontology/mo/'
  end

  # Returns a copy of the audio URI without its fragment.
  def base_uri
    # Remove the fragment if there is one
    base = uri.clone
    base.fragment = nil
    base
  end

  # RDF type of this resource; 'mo:AudioFile' unless overridden.
  def type_uri
    @type_uri || 'mo:AudioFile'
  end

  def rdf_mapping
    { 'ws:segment' => segments }
  end

  def show
    # The LIUM show name will be the file name, without extension or directory
    File.expand_path(@path).split('/')[-1].split('.')[0]
  end

  protected

  # Percent-encodes every byte of the URI that is outside the RFC 2396
  # "unreserved" set, giving a flat file name with no '/' in it. Produces
  # the same text as the URI.escape call it replaces; URI.escape is obsolete
  # and no longer available in recent Ruby versions.
  def local_file_name(uri)
    uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d]/) do |unsafe|
      unsafe.unpack('C*').map { |byte| '%%%02X' % byte }.join
    end
  end

  # Trains one GMM per speaker and stores it on the matching Speaker.
  def train_speaker_gmms
    segments # Making sure we have pre-computed segments and clusters
    # Would be nice to reuse GMMs computed as part of the segmentation process
    # but not sure how to access them without changing the Java API

    # Start by copying models from the universal background model, one per speaker, using MTrainInit
    parameter = fr.lium.spkDiarization.parameter.Parameter.new
    parameter.parameterInputFeature.setFeaturesDescription('audio2sphinx,1:3:2:0:0:0,13,1:1:300:4')
    parameter.parameterInputFeature.setFeatureMask(@path)
    parameter.parameterInitializationEM.setModelInitMethod('copy')
    parameter.parameterModelSetInputFile.setMask(File.join(File.expand_path(File.dirname(__FILE__)), 'ubm.gmm'))
    features = fr.lium.spkDiarization.lib.MainTools.readFeatureSet(parameter, @clusters)
    init_vect = java.util.ArrayList.new(@clusters.cluster_get_size)
    fr.lium.spkDiarization.programs.MTrainInit.make(features, @clusters, init_vect, parameter)

    # Adapt models to individual speakers detected in the audio, using MTrainMap
    parameter = fr.lium.spkDiarization.parameter.Parameter.new
    parameter.parameterInputFeature.setFeaturesDescription('audio2sphinx,1:3:2:0:0:0,13,1:1:300:4')
    parameter.parameterInputFeature.setFeatureMask(@path)
    parameter.parameterEM.setEMControl('1,5,0.01')
    parameter.parameterVarianceControl.setVarianceControl('0.01,10.0')
    parameter.show = show
    features.setCurrentShow(parameter.show)
    gmm_vect = java.util.ArrayList.new
    fr.lium.spkDiarization.programs.MTrainMAP.make(features, @clusters, init_vect, gmm_vect, parameter)

    # Populating the speakers with their GMMs
    # NOTE(review): this pairs models and speakers by position, so it relies
    # on MTrainMAP emitting models in the same order as the clusters are
    # iterated when building the segments - confirm against the LIUM API.
    gmm_vect.each_with_index do |speaker_model, i|
      speakers[i].model = speaker_model
    end
  end

  # Runs the LIUM diarization steps in sequence and returns the final
  # cluster set: sanity check, segmentation, linear and hierarchical
  # clustering, decoding, speech detection, gender detection and, when
  # enabled on the parameter, the final CE speaker clustering.
  def ester2(parameter)
    diarization = fr.lium.spkDiarization.system.Diarization.new
    diarization_parameter = parameter.parameterDiarization
    cluster_set = diarization.initialize__method(parameter)
    feature_set = fr.lium.spkDiarization.system.Diarization.load_feature(parameter, cluster_set, parameter.parameterInputFeature.getFeaturesDescString())
    feature_set.setCurrentShow(parameter.show)
    feature_count = feature_set.getNumberOfFeatures
    cluster_set.getFirstCluster().firstSegment().setLength(feature_count) unless parameter.parameterDiarization.isLoadInputSegmentation
    initial_clusters = diarization.sanityCheck(cluster_set, feature_set, parameter)
    segmented = diarization.segmentation("GLR", "FULL", initial_clusters, feature_set, parameter)
    linear_clustered = diarization.clusteringLinear(diarization_parameter.getThreshold("l"), segmented, feature_set, parameter)
    hierarchical_clustered = diarization.clustering(diarization_parameter.getThreshold("h"), linear_clustered, feature_set, parameter)
    decoded = diarization.decode(8, diarization_parameter.getThreshold("d"), hierarchical_clustered, feature_set, parameter)
    speech_split = diarization.speech("10,10,50", cluster_set, initial_clusters, decoded, feature_set, parameter)
    clusters = diarization.gender(cluster_set, speech_split, feature_set, parameter)
    if parameter.parameterDiarization.isCEClustering
      # If true, the program computes the NCLR/CE clustering at the end.
      # The diarization error rate is minimized.
      # If this option is not set, the program stops right after the detection of the gender
      # and the resulting segmentation is sufficient for a transcription system.
      clusters = diarization.speakerClustering(diarization_parameter.getThreshold("c"), "ce", cluster_set, clusters, feature_set, parameter)
    end
    clusters
  end

end
|
193
|
+
|
194
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'java'
|
18
|
+
|
19
|
+
module Diarize
|
20
|
+
|
21
|
+
# Plays back a portion of an audio file through javax.sound.sampled.
class AudioPlayer

  # Plays `duration` seconds of `file`, starting `start` seconds in, and
  # blocks until playback is over or the user interrupts it.
  #
  # file     - object responding to #path (the audio file to play)
  # start    - offset in seconds (default 0.0)
  # duration - number of seconds to play (default 10.0)
  #
  # Returns nil. The clip and the input stream are always released, even
  # when opening or starting playback raises.
  def play(file, start=0.0, duration=10.0)
    java_file = java.io.File.new(file.path)
    stream = javax.sound.sampled.AudioSystem.getAudioInputStream(java_file)
    begin
      clip = javax.sound.sampled.AudioSystem.clip
      begin
        clip.open(stream)
        clip.setMicrosecondPosition(start * 1000000)
        clip.start
        begin
          sleep(duration)
        rescue Interrupt
          # Only a user interrupt (Ctrl-C) is treated as "stop playing";
          # any other error is left to propagate after cleanup.
          $stderr.puts 'Stopping playback'
        end
      ensure
        clip.stop
        clip.close
      end
    ensure
      stream.close
    end
    nil
  end

end
|
41
|
+
|
42
|
+
end
|
data/lib/diarize/lium.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'java'
|
18
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'LIUM_SpkDiarization-4.2.jar')
|
19
|
+
|
20
|
+
# Root of the Java `fr` package, so that LIUM classes can be written as
# fr.lium.spkDiarization... from Ruby code.
def fr
  ::Java::Fr
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'audio_player')
|
18
|
+
|
19
|
+
require 'rubygems'
|
20
|
+
require 'to_rdf'
|
21
|
+
require 'uri'
|
22
|
+
|
23
|
+
module Diarize
|
24
|
+
|
25
|
+
# A contiguous stretch of an Audio attributed to one speaker.
class Segment

  attr_reader :start, :duration, :gender, :bandwidth

  # audio      - the Audio this segment belongs to
  # start      - start offset (seconds, per the media-fragment URI built in #uri)
  # duration   - length of the segment, same unit as start
  # gender     - gender reported for the segment's speaker
  # bandwidth  - bandwidth reported for the segment
  # speaker_id - identifier of the speaker, used as URI fragment
  def initialize(audio, start, duration, gender, bandwidth, speaker_id)
    @audio = audio
    @start = start
    @duration = duration
    # Stored under @gender so that the public #gender reader (and therefore
    # the 'ws:gender' RDF value) returns it instead of always being nil.
    @gender = gender
    @bandwidth = bandwidth
    @speaker_id = speaker_id
  end

  # Returns the Speaker of this segment; speakers are shared per URI
  # through Speaker.find_or_create.
  def speaker
    Speaker.find_or_create(URI("#{@audio.base_uri}##{@speaker_id}"), gender)
  end

  # Plays this segment through an AudioPlayer.
  def play
    AudioPlayer.new.play(@audio.file, start, duration)
  end

  include ToRdf

  def namespaces
    super.merge 'ws' => 'http://wsarchive.prototype0.net/ontology/'
  end

  # Media-fragment URI of the segment within its audio.
  def uri
    # http://www.w3.org/TR/media-frags/
    URI("#{@audio.base_uri}#t=#{start},#{start+duration}")
  end

  def type_uri
    'ws:Segment'
  end

  def rdf_mapping
    {
      'ws:start' => start,
      'ws:duration' => duration,
      'ws:gender' => gender,
      'ws:bandwidth' => bandwidth,
      'ws:speaker' => speaker,
    }
  end

end
|
73
|
+
|
74
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), 'segment')
|
18
|
+
|
19
|
+
module Diarize
|
20
|
+
|
21
|
+
# Builders turning diarization output into arrays of Segment objects.
class Segmentation

  # Parses a segmentation file, one segment per line.
  #
  # audio    - the Audio the segments belong to
  # seg_file - path of the file to read
  #
  # Lines starting with ';;' and blank lines are skipped. For every other
  # line the whitespace-separated fields are read by position: 2 = start and
  # 3 = duration (both divided by 100.0), 4 = gender, 6 = bandwidth,
  # 7 = speaker id.
  #
  # Returns an Array of Segment.
  def self.from_seg_file(audio, seg_file)
    segmentation = []
    # File.foreach closes the file once iteration is over.
    File.foreach(seg_file) do |line|
      next if line.start_with? ';;'
      parts = line.split(' ')
      # A blank line would otherwise yield a bogus zero-length segment.
      next if parts.empty?
      start = parts[2].to_i / 100.0
      duration = parts[3].to_i / 100.0
      gender = parts[4]
      bandwidth = parts[6]
      speaker_id = parts[7]
      segmentation << Segment.new(audio, start, duration, gender, bandwidth, speaker_id)
    end
    segmentation
  end

  # Builds the segments from a cluster set.
  #
  # audio    - the Audio the segments belong to
  # clusters - object whose #each yields speaker ids and whose
  #            #get_cluster(id) returns the cluster for that id; a cluster
  #            exposes #gender, #bandwidth and yields its segments from #each
  #
  # Returns an Array of Segment, grouped by speaker in iteration order.
  def self.from_clusters(audio, clusters)
    segmentation = []
    clusters.each do |speaker_id|
      cluster = clusters.get_cluster(speaker_id)
      gender = cluster.gender
      bandwidth = cluster.bandwidth
      cluster.each do |segment|
        segmentation << Segment.new(audio, segment.start_in_second, segment.length_in_second, gender, bandwidth, speaker_id)
      end
    end
    segmentation
  end

end
|
54
|
+
|
55
|
+
end
|
@@ -0,0 +1,187 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'rubygems'
|
18
|
+
require 'to_rdf'
|
19
|
+
require 'jblas'
|
20
|
+
|
21
|
+
module Diarize
|
22
|
+
|
23
|
+
# A speaker detected in some audio, optionally carrying a GMM (`model`) that
# can be compared with other speakers' models to decide whether two speakers
# are the same person.
#
# Speakers obtained through find_or_create are shared: one instance per URI,
# kept in a class-level hash.
class Speaker

  include JBLAS

  # Some possible matching heuristics if using GDMAP:
  # - speaker mean_log_likelihood needs to be more than -33 to be considered for match
  # - distance between two speakers need to be less than distance between speaker and universal model + detection threshold to be considered

  @@log_likelihood_threshold = -33
  @@detection_threshold = 0.2 # Need to learn that parameter

  @@speakers = {}

  attr_accessor :model_uri, :normalized
  attr_reader :gender, :model
  attr_writer :mean_log_likelihood

  # uri        - URI identifying the speaker (optional)
  # gender     - gender of the speaker (optional)
  # model_file - path of a GMM file to load as the speaker model (optional)
  def initialize(uri = nil, gender = nil, model_file = nil)
    @model = Speaker.load_model(model_file) if model_file
    @uri = uri
    @gender = gender
    @normalized = false
  end

  # Replaces the speaker model and drops the supervector cached from the
  # previous one, so that #supervector is rebuilt from the new model.
  #
  # NOTE(review): the `normalized` flag is deliberately left untouched here
  # (Speaker.ubm sets it before assigning the model); callers assigning a
  # fresh model to an already-normalized speaker should reset it themselves.
  def model=(model)
    @supervector = nil
    @model = model
  end

  # Returns a new Speaker holding the universal background model shipped
  # next to this file (ubm.gmm), flagged as already normalized.
  def self.ubm
    speaker = Speaker.new
    speaker.normalized = true
    speaker.model = Speaker.load_model(File.join(File.expand_path(File.dirname(__FILE__)), 'ubm.gmm'))
    speaker
  end

  # Returns the value set through mean_log_likelihood= when there is one,
  # otherwise the value reported by the model.
  def mean_log_likelihood
    @mean_log_likelihood || model.mean_log_likelihood # Will be NaN if model was loaded from somewhere
  end

  # Writes the speaker model to `filename`.
  def save_model(filename)
    # TODO perhaps a warning if a normalised model is being saved?
    write_gmm(filename, @model)
  end

  def self.detection_threshold=(threshold)
    @@detection_threshold = threshold
  end

  def self.detection_threshold
    @@detection_threshold
  end

  # Reads and returns the first GMM stored in `filename`.
  def self.load_model(filename)
    read_gmm(filename)
  end

  # Returns the shared Speaker registered for `uri`, creating it (with the
  # given gender) on first use.
  def self.find_or_create(uri, gender)
    @@speakers[uri] ||= Speaker.new(uri, gender)
  end

  # Returns the divergence between the two speakers' models, or nil when
  # either speaker has no model.
  def self.divergence(speaker1, speaker2)
    # TODO bundle in mean_log_likelihood to weight down unlikely models?
    return unless speaker1.model && speaker2.model
    # MAP Gaussian divergence
    # See "A model space framework for efficient speaker detection", Interspeech'05
    divergence_lium(speaker1, speaker2)
  end

  # Divergence computed by the LIUM library (Distance.GDMAP).
  def self.divergence_lium(speaker1, speaker2)
    fr.lium.spkDiarization.libModel.Distance.GDMAP(speaker1.model, speaker2.model)
  end

  # Divergence computed in Ruby from the speakers' supervectors.
  def self.divergence_ruby(speaker1, speaker2)
    SuperVector.divergence(speaker1.supervector, speaker2.supervector)
  end

  # Returns every [s1, s2] pair, s1 from speakers1 and s2 from speakers2,
  # for which s1.same_speaker_as(s2) holds.
  def self.match_sets(speakers1, speakers2)
    matches = []
    speakers1.each do |s1|
      speakers2.each do |s2|
        matches << [ s1, s2 ] if s1.same_speaker_as(s2)
      end
    end
    matches
  end

  # Returns every pair of distinct entries of `speakers` judged to be the
  # same speaker.
  def self.match(speakers)
    speakers.combination(2).select { |s1, s2| s1.same_speaker_as(s2) }
  end

  # Normalizes the model in place (at most once per speaker) and returns
  # the `normalized` flag.
  def normalize!
    unless @normalized
      # Applies M-Norm from "D-MAP: a Distance-Normalized MAP Estimation of Speaker Models for Automatic Speaker Verification"
      # to the associated GMM, placing it on a unit hyper-sphere with a UBM centre (model will be at distance one from the UBM
      # according to GDMAP)
      # Using supervectors: vector = (1.0 / distance_to_ubm) * vector + (1.0 - 1.0 / distance_to_ubm) * ubm_vector
      # NOTE(review): a model identical to the UBM gives a zero distance and
      # therefore non-finite means - confirm whether that can occur.
      speaker_ubm = Speaker.ubm
      distance_to_ubm = Math.sqrt(Speaker.divergence(self, speaker_ubm))
      model_scale = 1.0 / distance_to_ubm
      model.nb_of_components.times do |k|
        gaussian = model.components.get(k)
        ubm_gaussian = speaker_ubm.model.components.get(k)
        gaussian.dim.times do |i|
          normalized_mean = model_scale * gaussian.mean(i) + (1.0 - model_scale) * ubm_gaussian.mean(i)
          gaussian.set_mean(i, normalized_mean)
        end
      end
      # The means just changed: a supervector cached before normalization
      # no longer describes the model.
      @supervector = nil
      @normalized = true
    end
    @normalized
  end

  # Decides whether `other` is the same speaker as this one.
  #
  # Returns nil when either speaker's mean log likelihood is not above the
  # likelihood threshold; otherwise normalizes both models and returns
  # whether the detection score exceeds the detection threshold.
  def same_speaker_as(other)
    # Detection score defined in Ben2005
    likelihoods = [ mean_log_likelihood, other.mean_log_likelihood ]
    # all? with > is simply false for NaN likelihoods, where Array#min may
    # fail to compare.
    return unless likelihoods.all? { |likelihood| likelihood > @@log_likelihood_threshold }
    self.normalize!
    other.normalize!
    detection_score = 1.0 - Speaker.divergence(other, self)
    detection_score > @@detection_threshold
  end

  # Returns the supervector generated from the current model (cached until
  # the model is replaced or normalized).
  def supervector
    @supervector ||= SuperVector.generate_from_model(model)
  end

  include ToRdf

  attr_reader :uri

  def namespaces
    super.merge 'ws' => 'http://wsarchive.prototype0.net/ontology/'
  end

  def type_uri
    'ws:Speaker'
  end

  def rdf_mapping
    # Goes through #mean_log_likelihood so that a value set with
    # mean_log_likelihood= is the one exported.
    { 'ws:gender' => gender, 'ws:model' => model_uri, 'ws:mean_log_likelihood' => mean_log_likelihood, 'ws:supervector_hash' => supervector.hash.to_s }
  end

  protected

  # Reads the GMM container stored in `filename` and returns its first
  # model. The input is closed even when reading fails.
  def self.read_gmm(filename)
    gmmlist = java.util.ArrayList.new
    input = fr.lium.spkDiarization.lib.IOFile.new(filename, 'rb')
    input.open
    begin
      fr.lium.spkDiarization.libModel.ModelIO.readerGMMContainer(input, gmmlist)
    ensure
      input.close
    end
    gmmlist.to_a.first
  end

  # Writes `model` to `filename` as a single-entry GMM container. The
  # output is closed even when writing fails.
  def write_gmm(filename, model)
    gmmlist = java.util.ArrayList.new
    gmmlist << model
    output = fr.lium.spkDiarization.lib.IOFile.new(filename, 'wb')
    output.open
    begin
      fr.lium.spkDiarization.libModel.ModelIO.writerGMMContainer(output, gmmlist)
    ensure
      output.close
    end
  end

end
|
186
|
+
|
187
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# diarize-jruby
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 British Broadcasting Corporation
|
4
|
+
#
|
5
|
+
# Licensed under the GNU Affero General Public License version 3 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.gnu.org/licenses/agpl
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
module Diarize
|
18
|
+
|
19
|
+
# A speaker model flattened into a single row vector: the means of all the
# Gaussians of a GMM, laid out component after component.
class SuperVector

  include JBLAS

  attr_reader :vector

  # vector - the row matrix holding the supervector values
  def initialize(vector)
    @vector = vector
  end

  # Generates a supervector from a LIUM GMM: entry k * dim + i holds mean i
  # of Gaussian k.
  def self.generate_from_model(model)
    total_size = model.nb_of_components * model.components.get(0).dim
    means = DoubleMatrix.new(1, total_size)
    model.nb_of_components.times do |k|
      component = model.components.get(k)
      component.dim.times { |i| means[k * component.dim + i] = component.mean(i) }
    end
    SuperVector.new(means)
  end

  # Returns a vector of gaussian weights, same dimension as speaker's super
  # vectors (each Gaussian's weight repeated for all of its dimensions).
  # Computed once and cached.
  def self.ubm_gaussian_weights
    @@ubm_gaussian_weights ||= ubm_parameter_vector { |gaussian, _i| gaussian.weight }
  end

  # Returns a vector of diagonal covariances, same dimension as speaker's
  # super vectors. Computed once and cached.
  def self.ubm_covariance
    @@ubm_covariance ||= ubm_parameter_vector { |gaussian, i| gaussian.getCovariance(i, i) }
  end

  # Weighted squared difference of the two supervectors, each dimension
  # being divided by the UBM covariance and weighted by the UBM Gaussian
  # weight, summed over all dimensions.
  def self.divergence(sv1, sv2)
    weights = ubm_gaussian_weights
    squared_difference = (sv1.vector - sv2.vector) ** 2
    weights.mul(squared_difference / ubm_covariance).sum
  end

  # Number of entries in the supervector.
  def dim
    @vector.columns
  end

  # Hash of the underlying vector.
  def hash
    @vector.hash
  end

  # Builds a row vector with the dimension of the UBM supervector, filling
  # entry k * dim + i with the value the block returns for UBM Gaussian k
  # and dimension i.
  def self.ubm_parameter_vector
    ubm = Speaker.ubm
    values = DoubleMatrix.new(1, ubm.supervector.dim)
    ubm.model.nb_of_components.times do |k|
      gaussian = ubm.model.components.get(k)
      gaussian.dim.times do |i|
        values[k * gaussian.dim + i] = yield(gaussian, i)
      end
    end
    values
  end
  private_class_method :ubm_parameter_vector

end
|
85
|
+
|
86
|
+
end
|
data/lib/diarize/ubm.gmm
ADDED
Binary file
|
metadata
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: diarize-jruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
prerelease:
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- Yves Raimond
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-06-14 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: to-rdf
|
16
|
+
version_requirements: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - ">="
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: !binary |-
|
21
|
+
MA==
|
22
|
+
none: false
|
23
|
+
requirement: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: !binary |-
|
28
|
+
MA==
|
29
|
+
none: false
|
30
|
+
prerelease: false
|
31
|
+
type: :runtime
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: jblas-ruby
|
34
|
+
version_requirements: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: !binary |-
|
39
|
+
MA==
|
40
|
+
none: false
|
41
|
+
requirement: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: !binary |-
|
46
|
+
MA==
|
47
|
+
none: false
|
48
|
+
prerelease: false
|
49
|
+
type: :runtime
|
50
|
+
description: A library for JRuby wrapping the LIUM Speaker Diarization and including a few extra tools
|
51
|
+
email: yves.raimond@bbc.co.uk
|
52
|
+
executables: []
|
53
|
+
extensions: []
|
54
|
+
extra_rdoc_files: []
|
55
|
+
files:
|
56
|
+
- README.md
|
57
|
+
- diarize-jruby.gemspec
|
58
|
+
- lib/diarize.rb
|
59
|
+
- lib/diarize/LIUM_SpkDiarization-4.2.jar
|
60
|
+
- lib/diarize/lium.rb
|
61
|
+
- lib/diarize/audio.rb
|
62
|
+
- lib/diarize/audio_player.rb
|
63
|
+
- lib/diarize/segmentation.rb
|
64
|
+
- lib/diarize/segment.rb
|
65
|
+
- lib/diarize/ubm.gmm
|
66
|
+
- lib/diarize/speaker.rb
|
67
|
+
- lib/diarize/super_vector.rb
|
68
|
+
homepage: http://github.com/bbcrd/diarize
|
69
|
+
licenses: []
|
70
|
+
post_install_message:
|
71
|
+
rdoc_options: []
|
72
|
+
require_paths:
|
73
|
+
- lib
|
74
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
+
requirements:
|
76
|
+
- - ">="
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: !binary |-
|
79
|
+
MA==
|
80
|
+
none: false
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: !binary |-
|
86
|
+
MA==
|
87
|
+
none: false
|
88
|
+
requirements: []
|
89
|
+
rubyforge_project:
|
90
|
+
rubygems_version: 1.8.24
|
91
|
+
signing_key:
|
92
|
+
specification_version: 3
|
93
|
+
summary: Speaker Diarization for JRuby
|
94
|
+
test_files: []
|
95
|
+
has_rdoc: false
|