word_aligner 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +115 -0
- data/Guardfile +8 -0
- data/LICENSE.txt +20 -0
- data/README.md +45 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/word_aligner.rb +11 -0
- data/lib/word_aligner/aligner.rb +170 -0
- data/lib/word_aligner/word_error_rate.rb +44 -0
- data/spec/lib/word_aligner/aligner_spec.rb +31 -0
- data/spec/lib/word_aligner/word_error_rate_spec.rb +28 -0
- data/spec/lib/word_aligner_spec.rb +15 -0
- data/spec/sample_data/grab_for_comparision.rb +51 -0
- data/spec/sample_data/regression/sentences.yml +647 -0
- data/spec/sample_data/source_data/enough.hypotheses.txt +1 -0
- data/spec/sample_data/source_data/enough.txt +1 -0
- data/spec/sample_data/source_data/exactly_data.hypotheses.txt +1 -0
- data/spec/sample_data/source_data/exactly_data.txt +1 -0
- data/spec/sample_data/source_data/hamlet.hypotheses.txt +2 -0
- data/spec/sample_data/source_data/hamlet.txt +2 -0
- data/spec/sample_data/source_data/that_might.hypotheses.txt +1 -0
- data/spec/sample_data/source_data/that_might.txt +1 -0
- data/spec/sample_data/word_align.pl +302 -0
- data/spec/spec_helper.rb +20 -0
- metadata +143 -0
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
|
3
|
+
module WordAligner
|
4
|
+
# Convenience wrapper around a hash of alignment statistics.
#
# Every key of the hash becomes a reader through OpenStruct; the methods
# below read the +transcription_words+, +matching+ and +align_cost+ fields
# and derive percentage figures from them.
class WordErrorRate < OpenStruct
  # The object handed to the constructor, returned unchanged.
  attr_reader :data

  # @param data [Hash] statistics used to populate the struct fields
  def initialize(data)
    @data = data
    super
  end

  # @return the +transcription_words+ field; it is the denominator of all rates
  def words
    transcription_words
  end

  # @return the +matching+ field
  def correct_words
    matching
  end

  # @return the +align_cost+ field
  def incorrect_words
    align_cost
  end

  # @return [Float] share of +correct_words+ relative to +words+, in percent
  def percentage_correct
    percent_rate(correct_words)
  end

  # @return [Float] share of +incorrect_words+ relative to +words+, in percent
  def percentage_incorrect
    percent_rate(incorrect_words)
  end

  # @return [Float] 100 minus +percentage_incorrect+; this drops below zero
  #   whenever +align_cost+ is larger than +words+
  def percentage_accurate
    error_percentage = percentage_incorrect
    100 - error_percentage
  end

  private

  # Expresses +value+ as a percentage of +words+.
  # The denominator is never allowed below 1, so a zero word count yields
  # a finite number instead of a division by zero.
  def percent_rate(value)
    scaled = value * 100.0
    scaled / [words, 1].max
  end
end
|
44
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# One regression fixture entry. Its +transcription+ and +hypothesis+ fields
# are fed to the aligner so the outcome can be compared with the fixture.
class Sample < OpenStruct
  # Runs WordAligner::Aligner on this sample's transcription and hypothesis
  # and returns the +data+ of the resulting word error rate.
  # The result is memoised, so the aligner runs at most once per sample
  # (as long as the result is truthy).
  def aligner_result
    return @aligner_result if @aligner_result

    aligner = WordAligner::Aligner.new(transcription, hypothesis)
    @aligner_result = aligner.word_error_rate.data
  end
end
|
9
|
+
|
10
|
+
module WordAligner
  # Regression suite: every entry of sentences.yml is aligned again and the
  # resulting statistics must equal the stored entry.
  describe Aligner do
    regression_samples = YAML.load(
      File.read('spec/sample_data/regression/sentences.yml')
    )

    regression_samples.each_with_index do |expected, position|
      describe "sample #{position}" do
        subject { Sample.new(expected) }

        # Sanity checks on the fixture itself.
        its(:hypothesis) { should_not be_nil }
        its(:hypothesis) { should_not be_empty }

        its(:transcription) { should_not be_nil }
        its(:transcription) { should_not be_empty }

        # The freshly computed result has to reproduce the whole fixture entry.
        its(:aligner_result) { should eq expected }
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module WordAligner
  # Unit spec for the derived figures of WordErrorRate.
  describe WordErrorRate do
    # 8 transcription words, 6 of them matching, alignment cost of 2.
    let(:data) do
      {
        transcription_words: 8,
        matching: 6,
        align_cost: 2,
        insertions: 8,
        substitutions: 2,
        deletions: 0
      }
    end

    subject { WordErrorRate.new(data) }

    it { should be_a(WordErrorRate) }

    # Plain field aliases.
    its(:words) { should eq(8) }
    its(:correct_words) { should eq(6) }
    its(:incorrect_words) { should eq(2) }

    # Derived percentages.
    its(:percentage_correct) { should eq(75.0) }
    its(:percentage_incorrect) { should eq(25.0) }
    its(:percentage_accurate) { should eq(75.0) }
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# USAGE: ruby grab_for_comparision.rb regression/sentences.txt.wa \
|
2
|
+
# regression/sentences.hypotheses.txt.wa > regression/sentences.yml
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
# Paths of the transcription and the hypothesis file, in that order.
transcription_file, hypothesis_file = ARGV

# Fail with a readable message instead of a TypeError further down when an
# argument is missing.
unless transcription_file && hypothesis_file
  abort 'USAGE: ruby grab_for_comparision.rb TRANSCRIPTION_FILE HYPOTHESIS_FILE'
end

# The command is an argv array rather than an interpolated string, so no
# shell is involved and paths containing spaces or shell metacharacters
# reach perl unchanged.
command = ['perl', 'word_align.pl', transcription_file, hypothesis_file]

# Output of word_align.pl, one stripped entry per line.
output = IO.popen(command, &:read).split("\n").map(&:strip)

# Raw input lines; the record-parsing loop consumes them front to back.
transcription_lines = File.readlines(transcription_file)
hypothesis_lines = File.readlines(hypothesis_file)
|
11
|
+
|
12
|
+
# Removes a trailing parenthesised id such as " (utt_12)" from +str+.
#
# Only the last parenthesised group at the end of the line is removed;
# parentheses that occur earlier in the text are kept. (The previous
# pattern +\(.+?\)+ matched from the first "(" of the line, so everything
# after it was dropped as well.)
#
# @param str [String] a line of text, optionally ending in "(id)"
# @return [String] a copy of +str+ without the trailing id and the
#   whitespace around it; unchanged when there is no trailing id
def strip_id(str)
  str.sub(/\s*\([^()]+\)\s*$/, '')
end
|
15
|
+
|
16
|
+
# One hash per utterance, in input order.
records = []

# The output is consumed in groups of four lines: aligned transcription,
# aligned hypothesis, a statistics line containing "Words:", and a line
# holding the insertion/deletion/substitution counts.
loop do
  transcription = output.shift
  hypothesis = output.shift
  statistics = output.shift
  distance = output.shift

  # Stop at the first group that is not a per-utterance record. The nil
  # check covers output that ends exactly on a record boundary (or is
  # empty), which would otherwise raise NoMethodError and lose every
  # record collected so far.
  break unless statistics && distance && statistics.match(/Words:/)

  # The first three numbers of the statistics line are used here; any
  # further numbers on that line are ignored.
  words, correct, errors = statistics.scan(/\d+(?:[.]\d+)?/)
  insertions, deletions, substitutions = distance.scan(/\d+/)

  # The input files are read in lock-step with the output groups.
  transcription_line = strip_id(transcription_lines.shift.strip)
  hypothesis_line = strip_id(hypothesis_lines.shift.strip)

  # Utterances with an empty transcription are not recorded.
  next if transcription_line.empty?

  details = {
    transcription: transcription_line,
    hypothesis: hypothesis_line,
    insertions: insertions.to_i,
    deletions: deletions.to_i,
    substitutions: substitutions.to_i,
    matching: correct.to_i,
    align_cost: errors.to_i,
    transcription_words: words.to_i,
    aligned_transcription: strip_id(transcription),
    aligned_hypothesis: strip_id(hypothesis)
  }

  records << details
end

puts records.to_yaml
|
@@ -0,0 +1,647 @@
|
|
1
|
+
---
|
2
|
+
- :transcription: I think this might just work for fine
|
3
|
+
:hypothesis: I'd think this might just work for laine
|
4
|
+
:insertions: 0
|
5
|
+
:deletions: 0
|
6
|
+
:substitutions: 2
|
7
|
+
:matching: 6
|
8
|
+
:align_cost: 2
|
9
|
+
:transcription_words: 8
|
10
|
+
:aligned_transcription: I think this might just work for FINE
|
11
|
+
:aligned_hypothesis: I'D think this might just work for LAINE
|
12
|
+
- :transcription: did you register for a new account
|
13
|
+
:hypothesis: hit you register for a new account
|
14
|
+
:insertions: 0
|
15
|
+
:deletions: 0
|
16
|
+
:substitutions: 1
|
17
|
+
:matching: 6
|
18
|
+
:align_cost: 1
|
19
|
+
:transcription_words: 7
|
20
|
+
:aligned_transcription: DID you register for a new account
|
21
|
+
:aligned_hypothesis: HIT you register for a new account
|
22
|
+
- :transcription: I almost forgot to tell you about our next meeting
|
23
|
+
:hypothesis: I almost forgot to tell you about our next meeting
|
24
|
+
:insertions: 0
|
25
|
+
:deletions: 0
|
26
|
+
:substitutions: 0
|
27
|
+
:matching: 10
|
28
|
+
:align_cost: 0
|
29
|
+
:transcription_words: 10
|
30
|
+
:aligned_transcription: i almost forgot to tell you about our next meeting
|
31
|
+
:aligned_hypothesis: i almost forgot to tell you about our next meeting
|
32
|
+
- :transcription: I am not using bash anymore
|
33
|
+
:hypothesis: if I am not using bash anymore
|
34
|
+
:insertions: 1
|
35
|
+
:deletions: 0
|
36
|
+
:substitutions: 0
|
37
|
+
:matching: 6
|
38
|
+
:align_cost: 1
|
39
|
+
:transcription_words: 6
|
40
|
+
:aligned_transcription: '*** i am not using bash anymore'
|
41
|
+
:aligned_hypothesis: IF i am not using bash anymore
|
42
|
+
- :transcription: I think I might switch to a windows computer
|
43
|
+
:hypothesis: I think it might switch to a windows computer
|
44
|
+
:insertions: 0
|
45
|
+
:deletions: 0
|
46
|
+
:substitutions: 1
|
47
|
+
:matching: 8
|
48
|
+
:align_cost: 1
|
49
|
+
:transcription_words: 9
|
50
|
+
:aligned_transcription: i think I might switch to a windows computer
|
51
|
+
:aligned_hypothesis: i think IT might switch to a windows computer
|
52
|
+
- :transcription: it returns the current hypothesis
|
53
|
+
:hypothesis: it returns the current hypothesis
|
54
|
+
:insertions: 0
|
55
|
+
:deletions: 0
|
56
|
+
:substitutions: 0
|
57
|
+
:matching: 5
|
58
|
+
:align_cost: 0
|
59
|
+
:transcription_words: 5
|
60
|
+
:aligned_transcription: it returns the current hypothesis
|
61
|
+
:aligned_hypothesis: it returns the current hypothesis
|
62
|
+
- :transcription: you cannot code HTML by voice
|
63
|
+
:hypothesis: you cannot code HTML by avoids
|
64
|
+
:insertions: 0
|
65
|
+
:deletions: 0
|
66
|
+
:substitutions: 1
|
67
|
+
:matching: 5
|
68
|
+
:align_cost: 1
|
69
|
+
:transcription_words: 6
|
70
|
+
:aligned_transcription: you cannot code html by VOICE
|
71
|
+
:aligned_hypothesis: you cannot code html by AVOIDS
|
72
|
+
- :transcription: why exactly would you do that
|
73
|
+
:hypothesis: why exactly would you go that
|
74
|
+
:insertions: 0
|
75
|
+
:deletions: 0
|
76
|
+
:substitutions: 1
|
77
|
+
:matching: 5
|
78
|
+
:align_cost: 1
|
79
|
+
:transcription_words: 6
|
80
|
+
:aligned_transcription: why exactly would you DO that
|
81
|
+
:aligned_hypothesis: why exactly would you GO that
|
82
|
+
- :transcription: I didn't think about it like that
|
83
|
+
:hypothesis: I didn't think about it like that
|
84
|
+
:insertions: 0
|
85
|
+
:deletions: 0
|
86
|
+
:substitutions: 0
|
87
|
+
:matching: 7
|
88
|
+
:align_cost: 0
|
89
|
+
:transcription_words: 7
|
90
|
+
:aligned_transcription: i didn't think about it like that
|
91
|
+
:aligned_hypothesis: i didn't think about it like that
|
92
|
+
- :transcription: haven't you figured out the meaning of the algorithm yet
|
93
|
+
:hypothesis: haven't you fake it out the meaning of the I'd go with him yet
|
94
|
+
:insertions: 4
|
95
|
+
:deletions: 0
|
96
|
+
:substitutions: 2
|
97
|
+
:matching: 8
|
98
|
+
:align_cost: 6
|
99
|
+
:transcription_words: 10
|
100
|
+
:aligned_transcription: haven't you *** FIGURED out the meaning of the *** ***
|
101
|
+
*** ALGORITHM yet
|
102
|
+
:aligned_hypothesis: haven't you FAKE IT out the meaning of the I'D GO WITH
|
103
|
+
HIM yet
|
104
|
+
- :transcription: the algorithm is quite good
|
105
|
+
:hypothesis: the algorithm is quite good
|
106
|
+
:insertions: 0
|
107
|
+
:deletions: 0
|
108
|
+
:substitutions: 0
|
109
|
+
:matching: 5
|
110
|
+
:align_cost: 0
|
111
|
+
:transcription_words: 5
|
112
|
+
:aligned_transcription: the algorithm is quite good
|
113
|
+
:aligned_hypothesis: the algorithm is quite good
|
114
|
+
- :transcription: it responds in a custom way
|
115
|
+
:hypothesis: it's response in a custom way
|
116
|
+
:insertions: 0
|
117
|
+
:deletions: 0
|
118
|
+
:substitutions: 2
|
119
|
+
:matching: 4
|
120
|
+
:align_cost: 2
|
121
|
+
:transcription_words: 6
|
122
|
+
:aligned_transcription: IT RESPONDS in a custom way
|
123
|
+
:aligned_hypothesis: IT'S RESPONSE in a custom way
|
124
|
+
- :transcription: it won't work for HTML
|
125
|
+
:hypothesis: it won't work for HTML
|
126
|
+
:insertions: 0
|
127
|
+
:deletions: 0
|
128
|
+
:substitutions: 0
|
129
|
+
:matching: 5
|
130
|
+
:align_cost: 0
|
131
|
+
:transcription_words: 5
|
132
|
+
:aligned_transcription: it won't work for html
|
133
|
+
:aligned_hypothesis: it won't work for html
|
134
|
+
- :transcription: but it will work good for ruby on rails
|
135
|
+
:hypothesis: but it will work good for ruby on rails
|
136
|
+
:insertions: 0
|
137
|
+
:deletions: 0
|
138
|
+
:substitutions: 0
|
139
|
+
:matching: 9
|
140
|
+
:align_cost: 0
|
141
|
+
:transcription_words: 9
|
142
|
+
:aligned_transcription: but it will work good for ruby on rails
|
143
|
+
:aligned_hypothesis: but it will work good for ruby on rails
|
144
|
+
- :transcription: the user would decide when to update
|
145
|
+
:hypothesis: but the user would decide when to app to
|
146
|
+
:insertions: 2
|
147
|
+
:deletions: 0
|
148
|
+
:substitutions: 1
|
149
|
+
:matching: 6
|
150
|
+
:align_cost: 3
|
151
|
+
:transcription_words: 7
|
152
|
+
:aligned_transcription: '*** the user would decide when to *** UPDATE'
|
153
|
+
:aligned_hypothesis: BUT the user would decide when to APP TO
|
154
|
+
- :transcription: when you define a new class it is not there
|
155
|
+
:hypothesis: when you defining you close it is not there
|
156
|
+
:insertions: 0
|
157
|
+
:deletions: 1
|
158
|
+
:substitutions: 3
|
159
|
+
:matching: 6
|
160
|
+
:align_cost: 4
|
161
|
+
:transcription_words: 10
|
162
|
+
:aligned_transcription: when you DEFINE A NEW CLASS it is not there
|
163
|
+
:aligned_hypothesis: when you *** DEFINING YOU CLOSE it is not there
|
164
|
+
- :transcription: so the voice recognition system would learn new classes while you
|
165
|
+
are defining them
|
166
|
+
:hypothesis: so the voice recognition system would learn new classes where you are
|
167
|
+
defining them for
|
168
|
+
:insertions: 1
|
169
|
+
:deletions: 0
|
170
|
+
:substitutions: 1
|
171
|
+
:matching: 13
|
172
|
+
:align_cost: 2
|
173
|
+
:transcription_words: 14
|
174
|
+
:aligned_transcription: so the voice recognition system would learn new classes
|
175
|
+
WHILE you are defining them ***
|
176
|
+
:aligned_hypothesis: so the voice recognition system would learn new classes WHERE
|
177
|
+
you are defining them FOR
|
178
|
+
- :transcription: and it would build a custom language model based on ctags
|
179
|
+
:hypothesis: and it would build a custom language model based on see Tex
|
180
|
+
:insertions: 1
|
181
|
+
:deletions: 0
|
182
|
+
:substitutions: 1
|
183
|
+
:matching: 10
|
184
|
+
:align_cost: 2
|
185
|
+
:transcription_words: 11
|
186
|
+
:aligned_transcription: and it would build a custom language model based on ***
|
187
|
+
CTAGS
|
188
|
+
:aligned_hypothesis: and it would build a custom language model based on SEE
|
189
|
+
TEX
|
190
|
+
- :transcription: so it understands all your classes and methods like words
|
191
|
+
:hypothesis: so it understands Oreo classes and methods like words
|
192
|
+
:insertions: 0
|
193
|
+
:deletions: 1
|
194
|
+
:substitutions: 1
|
195
|
+
:matching: 8
|
196
|
+
:align_cost: 2
|
197
|
+
:transcription_words: 10
|
198
|
+
:aligned_transcription: so it understands ALL YOUR classes and methods like words
|
199
|
+
:aligned_hypothesis: so it understands *** OREO classes and methods like words
|
200
|
+
- :transcription: I think this might be a good idea
|
201
|
+
:hypothesis: I'd think this might be a good idea
|
202
|
+
:insertions: 0
|
203
|
+
:deletions: 0
|
204
|
+
:substitutions: 1
|
205
|
+
:matching: 7
|
206
|
+
:align_cost: 1
|
207
|
+
:transcription_words: 8
|
208
|
+
:aligned_transcription: I think this might be a good idea
|
209
|
+
:aligned_hypothesis: I'D think this might be a good idea
|
210
|
+
- :transcription: I think this should be much faster
|
211
|
+
:hypothesis: I think the should be much faster
|
212
|
+
:insertions: 0
|
213
|
+
:deletions: 0
|
214
|
+
:substitutions: 1
|
215
|
+
:matching: 6
|
216
|
+
:align_cost: 1
|
217
|
+
:transcription_words: 7
|
218
|
+
:aligned_transcription: i think THIS should be much faster
|
219
|
+
:aligned_hypothesis: i think THE should be much faster
|
220
|
+
- :transcription: did you see my latest commit
|
221
|
+
:hypothesis: and opted you ca latest committed up
|
222
|
+
:insertions: 1
|
223
|
+
:deletions: 0
|
224
|
+
:substitutions: 5
|
225
|
+
:matching: 1
|
226
|
+
:align_cost: 6
|
227
|
+
:transcription_words: 6
|
228
|
+
:aligned_transcription: '*** DID you SEE MY LATEST COMMIT'
|
229
|
+
:aligned_hypothesis: AND OPTED you CA LATEST COMMITTED UP
|
230
|
+
- :transcription: click on first result
|
231
|
+
:hypothesis: click on first result
|
232
|
+
:insertions: 0
|
233
|
+
:deletions: 0
|
234
|
+
:substitutions: 0
|
235
|
+
:matching: 4
|
236
|
+
:align_cost: 0
|
237
|
+
:transcription_words: 4
|
238
|
+
:aligned_transcription: click on first result
|
239
|
+
:aligned_hypothesis: click on first result
|
240
|
+
- :transcription: click on second result
|
241
|
+
:hypothesis: click on second free soft
|
242
|
+
:insertions: 1
|
243
|
+
:deletions: 0
|
244
|
+
:substitutions: 1
|
245
|
+
:matching: 3
|
246
|
+
:align_cost: 2
|
247
|
+
:transcription_words: 4
|
248
|
+
:aligned_transcription: click on second *** RESULT
|
249
|
+
:aligned_hypothesis: click on second FREE SOFT
|
250
|
+
- :transcription: go to Google
|
251
|
+
:hypothesis: go to Google on
|
252
|
+
:insertions: 1
|
253
|
+
:deletions: 0
|
254
|
+
:substitutions: 0
|
255
|
+
:matching: 3
|
256
|
+
:align_cost: 1
|
257
|
+
:transcription_words: 3
|
258
|
+
:aligned_transcription: go to google ***
|
259
|
+
:aligned_hypothesis: go to google ON
|
260
|
+
- :transcription: validates presence of name
|
261
|
+
:hypothesis: valid dates presence of name
|
262
|
+
:insertions: 1
|
263
|
+
:deletions: 0
|
264
|
+
:substitutions: 1
|
265
|
+
:matching: 3
|
266
|
+
:align_cost: 2
|
267
|
+
:transcription_words: 4
|
268
|
+
:aligned_transcription: '*** VALIDATES presence of name'
|
269
|
+
:aligned_hypothesis: VALID DATES presence of name
|
270
|
+
- :transcription: validates uniqueness of name
|
271
|
+
:hypothesis: wedded its uniqueness often name
|
272
|
+
:insertions: 1
|
273
|
+
:deletions: 0
|
274
|
+
:substitutions: 2
|
275
|
+
:matching: 2
|
276
|
+
:align_cost: 3
|
277
|
+
:transcription_words: 4
|
278
|
+
:aligned_transcription: '*** VALIDATES uniqueness OF name'
|
279
|
+
:aligned_hypothesis: WEDDED ITS uniqueness OFTEN name
|
280
|
+
- :transcription: belongs to language
|
281
|
+
:hypothesis: belongs to language
|
282
|
+
:insertions: 0
|
283
|
+
:deletions: 0
|
284
|
+
:substitutions: 0
|
285
|
+
:matching: 3
|
286
|
+
:align_cost: 0
|
287
|
+
:transcription_words: 3
|
288
|
+
:aligned_transcription: belongs to language
|
289
|
+
:aligned_hypothesis: belongs to language
|
290
|
+
- :transcription: belongs to user
|
291
|
+
:hypothesis: belongs to user
|
292
|
+
:insertions: 0
|
293
|
+
:deletions: 0
|
294
|
+
:substitutions: 0
|
295
|
+
:matching: 3
|
296
|
+
:align_cost: 0
|
297
|
+
:transcription_words: 3
|
298
|
+
:aligned_transcription: belongs to user
|
299
|
+
:aligned_hypothesis: belongs to user
|
300
|
+
- :transcription: it should have three actions
|
301
|
+
:hypothesis: it should have three actions
|
302
|
+
:insertions: 0
|
303
|
+
:deletions: 0
|
304
|
+
:substitutions: 0
|
305
|
+
:matching: 5
|
306
|
+
:align_cost: 0
|
307
|
+
:transcription_words: 5
|
308
|
+
:aligned_transcription: it should have three actions
|
309
|
+
:aligned_hypothesis: it should have three actions
|
310
|
+
- :transcription: I didn't think this would work
|
311
|
+
:hypothesis: I didn't think this would work
|
312
|
+
:insertions: 0
|
313
|
+
:deletions: 0
|
314
|
+
:substitutions: 0
|
315
|
+
:matching: 6
|
316
|
+
:align_cost: 0
|
317
|
+
:transcription_words: 6
|
318
|
+
:aligned_transcription: i didn't think this would work
|
319
|
+
:aligned_hypothesis: i didn't think this would work
|
320
|
+
- :transcription: I am now testing another recording
|
321
|
+
:hypothesis: I am not testing another recording
|
322
|
+
:insertions: 0
|
323
|
+
:deletions: 0
|
324
|
+
:substitutions: 1
|
325
|
+
:matching: 5
|
326
|
+
:align_cost: 1
|
327
|
+
:transcription_words: 6
|
328
|
+
:aligned_transcription: i am NOW testing another recording
|
329
|
+
:aligned_hypothesis: i am NOT testing another recording
|
330
|
+
- :transcription: hello and welcome
|
331
|
+
:hypothesis: hello and welcome
|
332
|
+
:insertions: 0
|
333
|
+
:deletions: 0
|
334
|
+
:substitutions: 0
|
335
|
+
:matching: 3
|
336
|
+
:align_cost: 0
|
337
|
+
:transcription_words: 3
|
338
|
+
:aligned_transcription: hello and welcome
|
339
|
+
:aligned_hypothesis: hello and welcome
|
340
|
+
- :transcription: by the way everything that you have just read was recognized by
|
341
|
+
my software
|
342
|
+
:hypothesis: by the way everything that you have just read was recognized by my
|
343
|
+
software
|
344
|
+
:insertions: 0
|
345
|
+
:deletions: 0
|
346
|
+
:substitutions: 0
|
347
|
+
:matching: 14
|
348
|
+
:align_cost: 0
|
349
|
+
:transcription_words: 14
|
350
|
+
:aligned_transcription: by the way everything that you have just read was recognized
|
351
|
+
by my software
|
352
|
+
:aligned_hypothesis: by the way everything that you have just read was recognized
|
353
|
+
by my software
|
354
|
+
- :transcription: with only minor errors in the recognition
|
355
|
+
:hypothesis: with only minor errors in the recognition
|
356
|
+
:insertions: 0
|
357
|
+
:deletions: 0
|
358
|
+
:substitutions: 0
|
359
|
+
:matching: 7
|
360
|
+
:align_cost: 0
|
361
|
+
:transcription_words: 7
|
362
|
+
:aligned_transcription: with only minor errors in the recognition
|
363
|
+
:aligned_hypothesis: with only minor errors in the recognition
|
364
|
+
- :transcription: please fetch the files from the server
|
365
|
+
:hypothesis: please search the files from the server
|
366
|
+
:insertions: 0
|
367
|
+
:deletions: 0
|
368
|
+
:substitutions: 1
|
369
|
+
:matching: 6
|
370
|
+
:align_cost: 1
|
371
|
+
:transcription_words: 7
|
372
|
+
:aligned_transcription: please FETCH the files from the server
|
373
|
+
:aligned_hypothesis: please SEARCH the files from the server
|
374
|
+
- :transcription: the real challenge is coming up with a good speech representation
|
375
|
+
of ruby
|
376
|
+
:hypothesis: the real challenge is coming up with Blake good speech representation
|
377
|
+
of rube
|
378
|
+
:insertions: 0
|
379
|
+
:deletions: 0
|
380
|
+
:substitutions: 2
|
381
|
+
:matching: 11
|
382
|
+
:align_cost: 2
|
383
|
+
:transcription_words: 13
|
384
|
+
:aligned_transcription: the real challenge is coming up with A good speech
|
385
|
+
representation of RUBY
|
386
|
+
:aligned_hypothesis: the real challenge is coming up with BLAKE good speech representation
|
387
|
+
of RUBE
|
388
|
+
- :transcription: that follows the principle of least surprise
|
389
|
+
:hypothesis: that follows the principle of the surprise
|
390
|
+
:insertions: 0
|
391
|
+
:deletions: 0
|
392
|
+
:substitutions: 1
|
393
|
+
:matching: 6
|
394
|
+
:align_cost: 1
|
395
|
+
:transcription_words: 7
|
396
|
+
:aligned_transcription: that follows the principle of LEAST surprise
|
397
|
+
:aligned_hypothesis: that follows the principle of THE surprise
|
398
|
+
- :transcription: and deals appropriately with ambiguous cases
|
399
|
+
:hypothesis: and this appropriately it was ambiguous cases
|
400
|
+
:insertions: 1
|
401
|
+
:deletions: 0
|
402
|
+
:substitutions: 2
|
403
|
+
:matching: 4
|
404
|
+
:align_cost: 3
|
405
|
+
:transcription_words: 6
|
406
|
+
:aligned_transcription: and DEALS appropriately *** WITH ambiguous cases
|
407
|
+
:aligned_hypothesis: and THIS appropriately IT WAS ambiguous cases
|
408
|
+
- :transcription: the good thing is that you can get rid of a lot of manual work
|
409
|
+
:hypothesis: the good thing is that you can get rid of a lot of manual work
|
410
|
+
:insertions: 0
|
411
|
+
:deletions: 0
|
412
|
+
:substitutions: 0
|
413
|
+
:matching: 15
|
414
|
+
:align_cost: 0
|
415
|
+
:transcription_words: 15
|
416
|
+
:aligned_transcription: the good thing is that you can get rid of a lot of manual
|
417
|
+
work
|
418
|
+
:aligned_hypothesis: the good thing is that you can get rid of a lot of manual
|
419
|
+
work
|
420
|
+
- :transcription: for example attribute accessors are nearly always placed at the
|
421
|
+
top of the file
|
422
|
+
:hypothesis: for example attribute excesses are nearly always placed at the top
|
423
|
+
of the file
|
424
|
+
:insertions: 0
|
425
|
+
:deletions: 0
|
426
|
+
:substitutions: 1
|
427
|
+
:matching: 13
|
428
|
+
:align_cost: 1
|
429
|
+
:transcription_words: 14
|
430
|
+
:aligned_transcription: for example attribute ACCESSORS are nearly always placed
|
431
|
+
at the top of the file
|
432
|
+
:aligned_hypothesis: for example attribute EXCESSES are nearly always placed at the
|
433
|
+
top of the file
|
434
|
+
- :transcription: so when you say something like
|
435
|
+
:hypothesis: so when you say something like
|
436
|
+
:insertions: 0
|
437
|
+
:deletions: 0
|
438
|
+
:substitutions: 0
|
439
|
+
:matching: 6
|
440
|
+
:align_cost: 0
|
441
|
+
:transcription_words: 6
|
442
|
+
:aligned_transcription: so when you say something like
|
443
|
+
:aligned_hypothesis: so when you say something like
|
444
|
+
- :transcription: attribute accessor file name
|
445
|
+
:hypothesis: attribute access server fine name
|
446
|
+
:insertions: 1
|
447
|
+
:deletions: 0
|
448
|
+
:substitutions: 2
|
449
|
+
:matching: 2
|
450
|
+
:align_cost: 3
|
451
|
+
:transcription_words: 4
|
452
|
+
:aligned_transcription: attribute *** ACCESSOR FILE name
|
453
|
+
:aligned_hypothesis: attribute ACCESS SERVER FINE name
|
454
|
+
- :transcription: it will automatically put the following line at the top of the file
|
455
|
+
in the right place
|
456
|
+
:hypothesis: it will automatically put the following line at the top of the file
|
457
|
+
in the right place
|
458
|
+
:insertions: 0
|
459
|
+
:deletions: 0
|
460
|
+
:substitutions: 0
|
461
|
+
:matching: 17
|
462
|
+
:align_cost: 0
|
463
|
+
:transcription_words: 17
|
464
|
+
:aligned_transcription: it will automatically put the following line at the top
|
465
|
+
of the file in the right place
|
466
|
+
:aligned_hypothesis: it will automatically put the following line at the top of the
|
467
|
+
file in the right place
|
468
|
+
- :transcription: so the whole approach works only with one unified style
|
469
|
+
:hypothesis: so the whole approach works only with one unified style
|
470
|
+
:insertions: 0
|
471
|
+
:deletions: 0
|
472
|
+
:substitutions: 0
|
473
|
+
:matching: 10
|
474
|
+
:align_cost: 0
|
475
|
+
:transcription_words: 10
|
476
|
+
:aligned_transcription: so the whole approach works only with one unified style
|
477
|
+
:aligned_hypothesis: so the whole approach works only with one unified style
|
478
|
+
- :transcription: which is the ruby best practices style which is published on github
|
479
|
+
:hypothesis: which is did ruby best practices style which is published on git help
|
480
|
+
:insertions: 1
|
481
|
+
:deletions: 0
|
482
|
+
:substitutions: 2
|
483
|
+
:matching: 10
|
484
|
+
:align_cost: 3
|
485
|
+
:transcription_words: 12
|
486
|
+
:aligned_transcription: which is THE ruby best practices style which is published
|
487
|
+
on *** GITHUB
|
488
|
+
:aligned_hypothesis: which is DID ruby best practices style which is published
|
489
|
+
on GIT HELP
|
490
|
+
- :transcription: I wonder if I should create a custom language model just for programming
|
491
|
+
:hypothesis: I wonder if I should create a custom language will just for programming
|
492
|
+
:insertions: 0
|
493
|
+
:deletions: 0
|
494
|
+
:substitutions: 1
|
495
|
+
:matching: 12
|
496
|
+
:align_cost: 1
|
497
|
+
:transcription_words: 13
|
498
|
+
:aligned_transcription: i wonder if i should create a custom language MODEL
|
499
|
+
just for programming
|
500
|
+
:aligned_hypothesis: i wonder if i should create a custom language WILL just
|
501
|
+
for programming
|
502
|
+
- :transcription: or if I should use the normal dictation model and just train it
|
503
|
+
for programming
|
504
|
+
:hypothesis: or if I should use that normal dictation on model and just train it
|
505
|
+
for programming
|
506
|
+
:insertions: 1
|
507
|
+
:deletions: 0
|
508
|
+
:substitutions: 1
|
509
|
+
:matching: 14
|
510
|
+
:align_cost: 2
|
511
|
+
:transcription_words: 15
|
512
|
+
:aligned_transcription: or if i should use THE normal dictation *** model and
|
513
|
+
just train it for programming
|
514
|
+
:aligned_hypothesis: or if i should use THAT normal dictation ON model and
|
515
|
+
just train it for programming
|
516
|
+
- :transcription: the advantage is that it would also recognize normal sentences such
|
517
|
+
as commit messages
|
518
|
+
:hypothesis: the advantage is that it would also recognized normal sentences such
|
519
|
+
as commit messages
|
520
|
+
:insertions: 0
|
521
|
+
:deletions: 0
|
522
|
+
:substitutions: 1
|
523
|
+
:matching: 13
|
524
|
+
:align_cost: 1
|
525
|
+
:transcription_words: 14
|
526
|
+
:aligned_transcription: the advantage is that it would also RECOGNIZE normal
|
527
|
+
sentences such as commit messages
|
528
|
+
:aligned_hypothesis: the advantage is that it would also RECOGNIZED normal sentences
|
529
|
+
such as commit messages
|
530
|
+
- :transcription: while the disadvantage is that it would not work as accurate on
|
531
|
+
programming messages
|
532
|
+
:hypothesis: why the disadvantage is that it would not work as a correct on programming
|
533
|
+
messages
|
534
|
+
:insertions: 1
|
535
|
+
:deletions: 0
|
536
|
+
:substitutions: 2
|
537
|
+
:matching: 12
|
538
|
+
:align_cost: 3
|
539
|
+
:transcription_words: 14
|
540
|
+
:aligned_transcription: WHILE the disadvantage is that it would not work as ***
|
541
|
+
ACCURATE on programming messages
|
542
|
+
:aligned_hypothesis: WHY the disadvantage is that it would not work as A CORRECT on programming
|
543
|
+
messages
|
544
|
+
- :transcription: another idea would be a hybrid approach
|
545
|
+
:hypothesis: another idea would be a hybrid approach
|
546
|
+
:insertions: 0
|
547
|
+
:deletions: 0
|
548
|
+
:substitutions: 0
|
549
|
+
:matching: 7
|
550
|
+
:align_cost: 0
|
551
|
+
:transcription_words: 7
|
552
|
+
:aligned_transcription: another idea would be a hybrid approach
|
553
|
+
:aligned_hypothesis: another idea would be a hybrid approach
|
554
|
+
- :transcription: that means whenever you are entering a string value or a commit
|
555
|
+
message it would switch automatically to the dictation language model
|
556
|
+
:hypothesis: that means whenever you are entering any string value or a commit message
|
557
|
+
it would switch automatically to the dictation language more
|
558
|
+
:insertions: 0
|
559
|
+
:deletions: 0
|
560
|
+
:substitutions: 2
|
561
|
+
:matching: 20
|
562
|
+
:align_cost: 2
|
563
|
+
:transcription_words: 22
|
564
|
+
:aligned_transcription: that means whenever you are entering A string value or a commit
|
565
|
+
message it would switch automatically to the dictation language MODEL
|
566
|
+
:aligned_hypothesis: that means whenever you are entering ANY string value or a commit
|
567
|
+
message it would switch automatically to the dictation language MORE
|
568
|
+
- :transcription: I am not sure if Google's voice recognition is actually that good
|
569
|
+
:hypothesis: I am not sure it's Google's voice recognition is actually dead code
|
570
|
+
:insertions: 0
|
571
|
+
:deletions: 0
|
572
|
+
:substitutions: 3
|
573
|
+
:matching: 9
|
574
|
+
:align_cost: 3
|
575
|
+
:transcription_words: 12
|
576
|
+
:aligned_transcription: i am not sure IF google's voice recognition is actually
|
577
|
+
THAT GOOD
|
578
|
+
:aligned_hypothesis: i am not sure IT'S google's voice recognition is actually
|
579
|
+
DEAD CODE
|
580
|
+
- :transcription: we can wait no problem
|
581
|
+
:hypothesis: we can wait no problem
|
582
|
+
:insertions: 0
|
583
|
+
:deletions: 0
|
584
|
+
:substitutions: 0
|
585
|
+
:matching: 5
|
586
|
+
:align_cost: 0
|
587
|
+
:transcription_words: 5
|
588
|
+
:aligned_transcription: we can wait no problem
|
589
|
+
:aligned_hypothesis: we can wait no problem
|
590
|
+
- :transcription: wow it's really fast
|
591
|
+
:hypothesis: wow it's really fast
|
592
|
+
:insertions: 0
|
593
|
+
:deletions: 0
|
594
|
+
:substitutions: 0
|
595
|
+
:matching: 4
|
596
|
+
:align_cost: 0
|
597
|
+
:transcription_words: 4
|
598
|
+
:aligned_transcription: wow it's really fast
|
599
|
+
:aligned_hypothesis: wow it's really fast
|
600
|
+
- :transcription: the URL is different
|
601
|
+
:hypothesis: do you are at it is different
|
602
|
+
:insertions: 3
|
603
|
+
:deletions: 0
|
604
|
+
:substitutions: 2
|
605
|
+
:matching: 2
|
606
|
+
:align_cost: 5
|
607
|
+
:transcription_words: 4
|
608
|
+
:aligned_transcription: '*** *** *** THE URL is different'
|
609
|
+
:aligned_hypothesis: DO YOU ARE AT IT is different
|
610
|
+
- :transcription: we would basically just need to change that
|
611
|
+
:hypothesis: we would basically just need to change that
|
612
|
+
:insertions: 0
|
613
|
+
:deletions: 0
|
614
|
+
:substitutions: 0
|
615
|
+
:matching: 8
|
616
|
+
:align_cost: 0
|
617
|
+
:transcription_words: 8
|
618
|
+
:aligned_transcription: we would basically just need to change that
|
619
|
+
:aligned_hypothesis: we would basically just need to change that
|
620
|
+
- :transcription: logos are symbols that attempt to visually represent the essence
|
621
|
+
of an organization
|
622
|
+
:hypothesis: logos are symbols that attempt to visually represent the essence of
|
623
|
+
an organization
|
624
|
+
:insertions: 0
|
625
|
+
:deletions: 0
|
626
|
+
:substitutions: 0
|
627
|
+
:matching: 13
|
628
|
+
:align_cost: 0
|
629
|
+
:transcription_words: 13
|
630
|
+
:aligned_transcription: logos are symbols that attempt to visually represent the
|
631
|
+
essence of an organization
|
632
|
+
:aligned_hypothesis: logos are symbols that attempt to visually represent the essence
|
633
|
+
of an organization
|
634
|
+
- :transcription: given that the new yahoo logo is a blandly cooperate humourless
|
635
|
+
confused jumble of unappealing elements
|
636
|
+
:hypothesis: given that the new yahoo rule is a plan to cooperate to over less confused
|
637
|
+
jumble of an unappealing elements
|
638
|
+
:insertions: 4
|
639
|
+
:deletions: 0
|
640
|
+
:substitutions: 3
|
641
|
+
:matching: 13
|
642
|
+
:align_cost: 7
|
643
|
+
:transcription_words: 16
|
644
|
+
:aligned_transcription: given that the new yahoo LOGO is a *** BLANDLY cooperate
|
645
|
+
*** *** HUMOURLESS confused jumble of *** unappealing elements
|
646
|
+
:aligned_hypothesis: given that the new yahoo RULE is a PLAN TO cooperate
|
647
|
+
TO OVER LESS confused jumble of AN unappealing elements
|