word_aligner 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +115 -0
- data/Guardfile +8 -0
- data/LICENSE.txt +20 -0
- data/README.md +45 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/word_aligner.rb +11 -0
- data/lib/word_aligner/aligner.rb +170 -0
- data/lib/word_aligner/word_error_rate.rb +44 -0
- data/spec/lib/word_aligner/aligner_spec.rb +31 -0
- data/spec/lib/word_aligner/word_error_rate_spec.rb +28 -0
- data/spec/lib/word_aligner_spec.rb +15 -0
- data/spec/sample_data/grab_for_comparision.rb +51 -0
- data/spec/sample_data/regression/sentences.yml +647 -0
- data/spec/sample_data/source_data/enough.hypotheses.txt +1 -0
- data/spec/sample_data/source_data/enough.txt +1 -0
- data/spec/sample_data/source_data/exactly_data.hypotheses.txt +1 -0
- data/spec/sample_data/source_data/exactly_data.txt +1 -0
- data/spec/sample_data/source_data/hamlet.hypotheses.txt +2 -0
- data/spec/sample_data/source_data/hamlet.txt +2 -0
- data/spec/sample_data/source_data/that_might.hypotheses.txt +1 -0
- data/spec/sample_data/source_data/that_might.txt +1 -0
- data/spec/sample_data/word_align.pl +302 -0
- data/spec/spec_helper.rb +20 -0
- metadata +143 -0
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
|
3
|
+
module WordAligner
  # Convenience wrapper around a hash of alignment statistics.
  #
  # Every key of the hash is exposed as an attribute through OpenStruct.
  # The methods defined here read three of them: +transcription_words+,
  # +matching+ and +align_cost+.
  class WordErrorRate < OpenStruct

    # The statistics hash exactly as it was passed to the constructor.
    attr_reader :data

    # data - Hash of statistics; handed to OpenStruct unchanged.
    def initialize(data)
      @data = data
      super(data)
    end

    # Number of words in the transcription (nil when the key is absent).
    def words
      transcription_words
    end

    # Number of matching words (nil when the key is absent).
    def correct_words
      matching
    end

    # Alignment cost, reported as the number of incorrect words
    # (nil when the key is absent).
    def incorrect_words
      align_cost
    end

    # 100 minus the error percentage. This goes negative whenever
    # align_cost is larger than the number of transcription words.
    def percentage_accurate
      100 - percentage_incorrect
    end

    # Matching words as a percentage of the transcription words.
    def percentage_correct
      percent_rate(correct_words)
    end

    # Alignment cost as a percentage of the transcription words.
    def percentage_incorrect
      percent_rate(incorrect_words)
    end

    private

    # Returns +value+ as a Float percentage of +words+.
    #
    # The denominator is clamped to at least 1 so an empty transcription
    # does not divide by zero. Counts missing from the data hash are
    # treated as 0 instead of raising on nil.
    def percent_rate(value)
      (value || 0) * 100.0 / [words || 0, 1].max
    end

  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Spec helper: an OpenStruct over one fixture record that can run the
# aligner on its own +transcription+ and +hypothesis+ attributes.
class Sample < OpenStruct
  # Returns the +data+ of the word error rate produced by aligning this
  # sample's transcription against its hypothesis. The result is cached
  # after the first call.
  def aligner_result
    return @aligner_result if @aligner_result

    aligner = WordAligner::Aligner.new(transcription, hypothesis)
    @aligner_result = aligner.word_error_rate.data
  end
end
|
9
|
+
|
10
|
+
module WordAligner
  describe Aligner do

    # Resolve the regression fixture relative to this spec file so the
    # suite loads the same data no matter which directory rspec is
    # started from.
    fixture_path = File.expand_path(
      '../../../sample_data/regression/sentences.yml', __FILE__
    )
    samples = YAML.load File.read(fixture_path)

    # One example group per fixture record; the aligner output must equal
    # the whole record.
    samples.each_with_index do |sample, idx|

      describe "sample #{idx}" do
        subject { Sample.new(sample) }

        its(:hypothesis) { should_not be_nil }
        its(:hypothesis) { should_not be_empty }

        its(:transcription) { should_not be_nil }
        its(:transcription) { should_not be_empty }

        its(:aligner_result) { should eq sample }
      end
    end
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module WordAligner
  describe WordErrorRate do

    # Statistics for an 8-word transcription with two substitutions.
    # The counts are kept self-consistent:
    # align_cost == insertions + deletions + substitutions.
    let(:data) do
      {
        insertions: 0,
        substitutions: 2,
        deletions: 0,
        align_cost: 2,
        transcription_words: 8,
        matching: 6
      }
    end

    subject { WordErrorRate.new(data) }

    it { should be_a(WordErrorRate) }

    its(:words) { should eq(8) }
    its(:correct_words) { should eq(6) }
    its(:incorrect_words) { should eq(2) }
    its(:percentage_correct) { should eq(75.0) }
    its(:percentage_incorrect) { should eq(25.0) }
    its(:percentage_accurate) { should eq(75.0) }
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# USAGE: ruby grab_for_comparision.rb regression/sentences.txt.wa \
|
2
|
+
# regression/sentences.hypotheses.txt.wa > regression/sentences.yml
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
transcription_file, hypothesis_file = ARGV

# Fail fast with a usage line instead of a TypeError from
# File.readlines(nil) when an argument is missing.
unless transcription_file && hypothesis_file
  abort "USAGE: ruby grab_for_comparision.rb TRANSCRIPTION_FILE HYPOTHESIS_FILE"
end

# The command is passed as an argv array so it never goes through a shell:
# file names containing spaces or shell metacharacters reach perl intact.
command = ['perl', 'word_align.pl', transcription_file, hypothesis_file]
output = IO.popen(command, &:read).split("\n").map(&:strip)

# Raw input lines, consumed in step with the aligner output further down.
transcription_lines = File.readlines(transcription_file)
hypothesis_lines = File.readlines(hypothesis_file)
|
11
|
+
|
12
|
+
# Returns +str+ without its trailing parenthesised group and the
# whitespace around it, e.g. "hello world (utt_1)" => "hello world".
#
# Only the last group on the line is removed. The group body may not
# contain parentheses, so an earlier "(...)" in the text is kept rather
# than swallowed together with everything after it.
def strip_id(str)
  str.sub(/\s*\([^()]+\)\s*$/, '')
end
|
15
|
+
|
16
|
+
records = []

# The aligner output is consumed four lines at a time: aligned
# transcription, aligned hypothesis, a statistics line and a distance line.
output.each_slice(4) do |transcription, hypothesis, statistics, distance|
  # Stop at the first group that is incomplete or whose third line is not
  # a "Words:" statistics line. The nil check keeps an empty or truncated
  # aligner output from raising NoMethodError.
  break unless distance && statistics =~ /Words:/

  # Only the first three numbers of the statistics line are used; any
  # further numbers on that line are ignored.
  words, correct, errors = statistics.scan(/\d+(?:[.]\d+)?/)
  insertions, deletions, substitutions = distance.scan(/\d+/)

  # The source files are read in lock-step with the aligner output, so the
  # lines are shifted even when the record is skipped below.
  transcription_line = strip_id(transcription_lines.shift.strip)
  hypothesis_line = strip_id(hypothesis_lines.shift.strip)

  next if transcription_line.empty?

  records << {
    transcription: transcription_line,
    hypothesis: hypothesis_line,
    insertions: insertions.to_i,
    deletions: deletions.to_i,
    substitutions: substitutions.to_i,
    matching: correct.to_i,
    align_cost: errors.to_i,
    transcription_words: words.to_i,
    aligned_transcription: strip_id(transcription),
    aligned_hypothesis: strip_id(hypothesis)
  }
end

puts records.to_yaml
|
@@ -0,0 +1,647 @@
|
|
1
|
+
---
|
2
|
+
- :transcription: I think this might just work for fine
|
3
|
+
:hypothesis: I'd think this might just work for laine
|
4
|
+
:insertions: 0
|
5
|
+
:deletions: 0
|
6
|
+
:substitutions: 2
|
7
|
+
:matching: 6
|
8
|
+
:align_cost: 2
|
9
|
+
:transcription_words: 8
|
10
|
+
:aligned_transcription: I think this might just work for FINE
|
11
|
+
:aligned_hypothesis: I'D think this might just work for LAINE
|
12
|
+
- :transcription: did you register for a new account
|
13
|
+
:hypothesis: hit you register for a new account
|
14
|
+
:insertions: 0
|
15
|
+
:deletions: 0
|
16
|
+
:substitutions: 1
|
17
|
+
:matching: 6
|
18
|
+
:align_cost: 1
|
19
|
+
:transcription_words: 7
|
20
|
+
:aligned_transcription: DID you register for a new account
|
21
|
+
:aligned_hypothesis: HIT you register for a new account
|
22
|
+
- :transcription: I almost forgot to tell you about our next meeting
|
23
|
+
:hypothesis: I almost forgot to tell you about our next meeting
|
24
|
+
:insertions: 0
|
25
|
+
:deletions: 0
|
26
|
+
:substitutions: 0
|
27
|
+
:matching: 10
|
28
|
+
:align_cost: 0
|
29
|
+
:transcription_words: 10
|
30
|
+
:aligned_transcription: i almost forgot to tell you about our next meeting
|
31
|
+
:aligned_hypothesis: i almost forgot to tell you about our next meeting
|
32
|
+
- :transcription: I am not using bash anymore
|
33
|
+
:hypothesis: if I am not using bash anymore
|
34
|
+
:insertions: 1
|
35
|
+
:deletions: 0
|
36
|
+
:substitutions: 0
|
37
|
+
:matching: 6
|
38
|
+
:align_cost: 1
|
39
|
+
:transcription_words: 6
|
40
|
+
:aligned_transcription: '*** i am not using bash anymore'
|
41
|
+
:aligned_hypothesis: IF i am not using bash anymore
|
42
|
+
- :transcription: I think I might switch to a windows computer
|
43
|
+
:hypothesis: I think it might switch to a windows computer
|
44
|
+
:insertions: 0
|
45
|
+
:deletions: 0
|
46
|
+
:substitutions: 1
|
47
|
+
:matching: 8
|
48
|
+
:align_cost: 1
|
49
|
+
:transcription_words: 9
|
50
|
+
:aligned_transcription: i think I might switch to a windows computer
|
51
|
+
:aligned_hypothesis: i think IT might switch to a windows computer
|
52
|
+
- :transcription: it returns the current hypothesis
|
53
|
+
:hypothesis: it returns the current hypothesis
|
54
|
+
:insertions: 0
|
55
|
+
:deletions: 0
|
56
|
+
:substitutions: 0
|
57
|
+
:matching: 5
|
58
|
+
:align_cost: 0
|
59
|
+
:transcription_words: 5
|
60
|
+
:aligned_transcription: it returns the current hypothesis
|
61
|
+
:aligned_hypothesis: it returns the current hypothesis
|
62
|
+
- :transcription: you cannot code HTML by voice
|
63
|
+
:hypothesis: you cannot code HTML by avoids
|
64
|
+
:insertions: 0
|
65
|
+
:deletions: 0
|
66
|
+
:substitutions: 1
|
67
|
+
:matching: 5
|
68
|
+
:align_cost: 1
|
69
|
+
:transcription_words: 6
|
70
|
+
:aligned_transcription: you cannot code html by VOICE
|
71
|
+
:aligned_hypothesis: you cannot code html by AVOIDS
|
72
|
+
- :transcription: why exactly would you do that
|
73
|
+
:hypothesis: why exactly would you go that
|
74
|
+
:insertions: 0
|
75
|
+
:deletions: 0
|
76
|
+
:substitutions: 1
|
77
|
+
:matching: 5
|
78
|
+
:align_cost: 1
|
79
|
+
:transcription_words: 6
|
80
|
+
:aligned_transcription: why exactly would you DO that
|
81
|
+
:aligned_hypothesis: why exactly would you GO that
|
82
|
+
- :transcription: I didn't think about it like that
|
83
|
+
:hypothesis: I didn't think about it like that
|
84
|
+
:insertions: 0
|
85
|
+
:deletions: 0
|
86
|
+
:substitutions: 0
|
87
|
+
:matching: 7
|
88
|
+
:align_cost: 0
|
89
|
+
:transcription_words: 7
|
90
|
+
:aligned_transcription: i didn't think about it like that
|
91
|
+
:aligned_hypothesis: i didn't think about it like that
|
92
|
+
- :transcription: haven't you figured out the meaning of the algorithm yet
|
93
|
+
:hypothesis: haven't you fake it out the meaning of the I'd go with him yet
|
94
|
+
:insertions: 4
|
95
|
+
:deletions: 0
|
96
|
+
:substitutions: 2
|
97
|
+
:matching: 8
|
98
|
+
:align_cost: 6
|
99
|
+
:transcription_words: 10
|
100
|
+
:aligned_transcription: haven't you *** FIGURED out the meaning of the *** ***
|
101
|
+
*** ALGORITHM yet
|
102
|
+
:aligned_hypothesis: haven't you FAKE IT out the meaning of the I'D GO WITH
|
103
|
+
HIM yet
|
104
|
+
- :transcription: the algorithm is quite good
|
105
|
+
:hypothesis: the algorithm is quite good
|
106
|
+
:insertions: 0
|
107
|
+
:deletions: 0
|
108
|
+
:substitutions: 0
|
109
|
+
:matching: 5
|
110
|
+
:align_cost: 0
|
111
|
+
:transcription_words: 5
|
112
|
+
:aligned_transcription: the algorithm is quite good
|
113
|
+
:aligned_hypothesis: the algorithm is quite good
|
114
|
+
- :transcription: it responds in a custom way
|
115
|
+
:hypothesis: it's response in a custom way
|
116
|
+
:insertions: 0
|
117
|
+
:deletions: 0
|
118
|
+
:substitutions: 2
|
119
|
+
:matching: 4
|
120
|
+
:align_cost: 2
|
121
|
+
:transcription_words: 6
|
122
|
+
:aligned_transcription: IT RESPONDS in a custom way
|
123
|
+
:aligned_hypothesis: IT'S RESPONSE in a custom way
|
124
|
+
- :transcription: it won't work for HTML
|
125
|
+
:hypothesis: it won't work for HTML
|
126
|
+
:insertions: 0
|
127
|
+
:deletions: 0
|
128
|
+
:substitutions: 0
|
129
|
+
:matching: 5
|
130
|
+
:align_cost: 0
|
131
|
+
:transcription_words: 5
|
132
|
+
:aligned_transcription: it won't work for html
|
133
|
+
:aligned_hypothesis: it won't work for html
|
134
|
+
- :transcription: but it will work good for ruby on rails
|
135
|
+
:hypothesis: but it will work good for ruby on rails
|
136
|
+
:insertions: 0
|
137
|
+
:deletions: 0
|
138
|
+
:substitutions: 0
|
139
|
+
:matching: 9
|
140
|
+
:align_cost: 0
|
141
|
+
:transcription_words: 9
|
142
|
+
:aligned_transcription: but it will work good for ruby on rails
|
143
|
+
:aligned_hypothesis: but it will work good for ruby on rails
|
144
|
+
- :transcription: the user would decide when to update
|
145
|
+
:hypothesis: but the user would decide when to app to
|
146
|
+
:insertions: 2
|
147
|
+
:deletions: 0
|
148
|
+
:substitutions: 1
|
149
|
+
:matching: 6
|
150
|
+
:align_cost: 3
|
151
|
+
:transcription_words: 7
|
152
|
+
:aligned_transcription: '*** the user would decide when to *** UPDATE'
|
153
|
+
:aligned_hypothesis: BUT the user would decide when to APP TO
|
154
|
+
- :transcription: when you define a new class it is not there
|
155
|
+
:hypothesis: when you defining you close it is not there
|
156
|
+
:insertions: 0
|
157
|
+
:deletions: 1
|
158
|
+
:substitutions: 3
|
159
|
+
:matching: 6
|
160
|
+
:align_cost: 4
|
161
|
+
:transcription_words: 10
|
162
|
+
:aligned_transcription: when you DEFINE A NEW CLASS it is not there
|
163
|
+
:aligned_hypothesis: when you *** DEFINING YOU CLOSE it is not there
|
164
|
+
- :transcription: so the voice recognition system would learn new classes while you
|
165
|
+
are defining them
|
166
|
+
:hypothesis: so the voice recognition system would learn new classes where you are
|
167
|
+
defining them for
|
168
|
+
:insertions: 1
|
169
|
+
:deletions: 0
|
170
|
+
:substitutions: 1
|
171
|
+
:matching: 13
|
172
|
+
:align_cost: 2
|
173
|
+
:transcription_words: 14
|
174
|
+
:aligned_transcription: so the voice recognition system would learn new classes
|
175
|
+
WHILE you are defining them ***
|
176
|
+
:aligned_hypothesis: so the voice recognition system would learn new classes WHERE
|
177
|
+
you are defining them FOR
|
178
|
+
- :transcription: and it would build a custom language model based on ctags
|
179
|
+
:hypothesis: and it would build a custom language model based on see Tex
|
180
|
+
:insertions: 1
|
181
|
+
:deletions: 0
|
182
|
+
:substitutions: 1
|
183
|
+
:matching: 10
|
184
|
+
:align_cost: 2
|
185
|
+
:transcription_words: 11
|
186
|
+
:aligned_transcription: and it would build a custom language model based on ***
|
187
|
+
CTAGS
|
188
|
+
:aligned_hypothesis: and it would build a custom language model based on SEE
|
189
|
+
TEX
|
190
|
+
- :transcription: so it understands all your classes and methods like words
|
191
|
+
:hypothesis: so it understands Oreo classes and methods like words
|
192
|
+
:insertions: 0
|
193
|
+
:deletions: 1
|
194
|
+
:substitutions: 1
|
195
|
+
:matching: 8
|
196
|
+
:align_cost: 2
|
197
|
+
:transcription_words: 10
|
198
|
+
:aligned_transcription: so it understands ALL YOUR classes and methods like words
|
199
|
+
:aligned_hypothesis: so it understands *** OREO classes and methods like words
|
200
|
+
- :transcription: I think this might be a good idea
|
201
|
+
:hypothesis: I'd think this might be a good idea
|
202
|
+
:insertions: 0
|
203
|
+
:deletions: 0
|
204
|
+
:substitutions: 1
|
205
|
+
:matching: 7
|
206
|
+
:align_cost: 1
|
207
|
+
:transcription_words: 8
|
208
|
+
:aligned_transcription: I think this might be a good idea
|
209
|
+
:aligned_hypothesis: I'D think this might be a good idea
|
210
|
+
- :transcription: I think this should be much faster
|
211
|
+
:hypothesis: I think the should be much faster
|
212
|
+
:insertions: 0
|
213
|
+
:deletions: 0
|
214
|
+
:substitutions: 1
|
215
|
+
:matching: 6
|
216
|
+
:align_cost: 1
|
217
|
+
:transcription_words: 7
|
218
|
+
:aligned_transcription: i think THIS should be much faster
|
219
|
+
:aligned_hypothesis: i think THE should be much faster
|
220
|
+
- :transcription: did you see my latest commit
|
221
|
+
:hypothesis: and opted you ca latest committed up
|
222
|
+
:insertions: 1
|
223
|
+
:deletions: 0
|
224
|
+
:substitutions: 5
|
225
|
+
:matching: 1
|
226
|
+
:align_cost: 6
|
227
|
+
:transcription_words: 6
|
228
|
+
:aligned_transcription: '*** DID you SEE MY LATEST COMMIT'
|
229
|
+
:aligned_hypothesis: AND OPTED you CA LATEST COMMITTED UP
|
230
|
+
- :transcription: click on first result
|
231
|
+
:hypothesis: click on first result
|
232
|
+
:insertions: 0
|
233
|
+
:deletions: 0
|
234
|
+
:substitutions: 0
|
235
|
+
:matching: 4
|
236
|
+
:align_cost: 0
|
237
|
+
:transcription_words: 4
|
238
|
+
:aligned_transcription: click on first result
|
239
|
+
:aligned_hypothesis: click on first result
|
240
|
+
- :transcription: click on second result
|
241
|
+
:hypothesis: click on second free soft
|
242
|
+
:insertions: 1
|
243
|
+
:deletions: 0
|
244
|
+
:substitutions: 1
|
245
|
+
:matching: 3
|
246
|
+
:align_cost: 2
|
247
|
+
:transcription_words: 4
|
248
|
+
:aligned_transcription: click on second *** RESULT
|
249
|
+
:aligned_hypothesis: click on second FREE SOFT
|
250
|
+
- :transcription: go to Google
|
251
|
+
:hypothesis: go to Google on
|
252
|
+
:insertions: 1
|
253
|
+
:deletions: 0
|
254
|
+
:substitutions: 0
|
255
|
+
:matching: 3
|
256
|
+
:align_cost: 1
|
257
|
+
:transcription_words: 3
|
258
|
+
:aligned_transcription: go to google ***
|
259
|
+
:aligned_hypothesis: go to google ON
|
260
|
+
- :transcription: validates presence of name
|
261
|
+
:hypothesis: valid dates presence of name
|
262
|
+
:insertions: 1
|
263
|
+
:deletions: 0
|
264
|
+
:substitutions: 1
|
265
|
+
:matching: 3
|
266
|
+
:align_cost: 2
|
267
|
+
:transcription_words: 4
|
268
|
+
:aligned_transcription: '*** VALIDATES presence of name'
|
269
|
+
:aligned_hypothesis: VALID DATES presence of name
|
270
|
+
- :transcription: validates uniqueness of name
|
271
|
+
:hypothesis: wedded its uniqueness often name
|
272
|
+
:insertions: 1
|
273
|
+
:deletions: 0
|
274
|
+
:substitutions: 2
|
275
|
+
:matching: 2
|
276
|
+
:align_cost: 3
|
277
|
+
:transcription_words: 4
|
278
|
+
:aligned_transcription: '*** VALIDATES uniqueness OF name'
|
279
|
+
:aligned_hypothesis: WEDDED ITS uniqueness OFTEN name
|
280
|
+
- :transcription: belongs to language
|
281
|
+
:hypothesis: belongs to language
|
282
|
+
:insertions: 0
|
283
|
+
:deletions: 0
|
284
|
+
:substitutions: 0
|
285
|
+
:matching: 3
|
286
|
+
:align_cost: 0
|
287
|
+
:transcription_words: 3
|
288
|
+
:aligned_transcription: belongs to language
|
289
|
+
:aligned_hypothesis: belongs to language
|
290
|
+
- :transcription: belongs to user
|
291
|
+
:hypothesis: belongs to user
|
292
|
+
:insertions: 0
|
293
|
+
:deletions: 0
|
294
|
+
:substitutions: 0
|
295
|
+
:matching: 3
|
296
|
+
:align_cost: 0
|
297
|
+
:transcription_words: 3
|
298
|
+
:aligned_transcription: belongs to user
|
299
|
+
:aligned_hypothesis: belongs to user
|
300
|
+
- :transcription: it should have three actions
|
301
|
+
:hypothesis: it should have three actions
|
302
|
+
:insertions: 0
|
303
|
+
:deletions: 0
|
304
|
+
:substitutions: 0
|
305
|
+
:matching: 5
|
306
|
+
:align_cost: 0
|
307
|
+
:transcription_words: 5
|
308
|
+
:aligned_transcription: it should have three actions
|
309
|
+
:aligned_hypothesis: it should have three actions
|
310
|
+
- :transcription: I didn't think this would work
|
311
|
+
:hypothesis: I didn't think this would work
|
312
|
+
:insertions: 0
|
313
|
+
:deletions: 0
|
314
|
+
:substitutions: 0
|
315
|
+
:matching: 6
|
316
|
+
:align_cost: 0
|
317
|
+
:transcription_words: 6
|
318
|
+
:aligned_transcription: i didn't think this would work
|
319
|
+
:aligned_hypothesis: i didn't think this would work
|
320
|
+
- :transcription: I am now testing another recording
|
321
|
+
:hypothesis: I am not testing another recording
|
322
|
+
:insertions: 0
|
323
|
+
:deletions: 0
|
324
|
+
:substitutions: 1
|
325
|
+
:matching: 5
|
326
|
+
:align_cost: 1
|
327
|
+
:transcription_words: 6
|
328
|
+
:aligned_transcription: i am NOW testing another recording
|
329
|
+
:aligned_hypothesis: i am NOT testing another recording
|
330
|
+
- :transcription: hello and welcome
|
331
|
+
:hypothesis: hello and welcome
|
332
|
+
:insertions: 0
|
333
|
+
:deletions: 0
|
334
|
+
:substitutions: 0
|
335
|
+
:matching: 3
|
336
|
+
:align_cost: 0
|
337
|
+
:transcription_words: 3
|
338
|
+
:aligned_transcription: hello and welcome
|
339
|
+
:aligned_hypothesis: hello and welcome
|
340
|
+
- :transcription: by the way everything that you have just read was recognized by
|
341
|
+
my software
|
342
|
+
:hypothesis: by the way everything that you have just read was recognized by my
|
343
|
+
software
|
344
|
+
:insertions: 0
|
345
|
+
:deletions: 0
|
346
|
+
:substitutions: 0
|
347
|
+
:matching: 14
|
348
|
+
:align_cost: 0
|
349
|
+
:transcription_words: 14
|
350
|
+
:aligned_transcription: by the way everything that you have just read was recognized
|
351
|
+
by my software
|
352
|
+
:aligned_hypothesis: by the way everything that you have just read was recognized
|
353
|
+
by my software
|
354
|
+
- :transcription: with only minor errors in the recognition
|
355
|
+
:hypothesis: with only minor errors in the recognition
|
356
|
+
:insertions: 0
|
357
|
+
:deletions: 0
|
358
|
+
:substitutions: 0
|
359
|
+
:matching: 7
|
360
|
+
:align_cost: 0
|
361
|
+
:transcription_words: 7
|
362
|
+
:aligned_transcription: with only minor errors in the recognition
|
363
|
+
:aligned_hypothesis: with only minor errors in the recognition
|
364
|
+
- :transcription: please fetch the files from the server
|
365
|
+
:hypothesis: please search the files from the server
|
366
|
+
:insertions: 0
|
367
|
+
:deletions: 0
|
368
|
+
:substitutions: 1
|
369
|
+
:matching: 6
|
370
|
+
:align_cost: 1
|
371
|
+
:transcription_words: 7
|
372
|
+
:aligned_transcription: please FETCH the files from the server
|
373
|
+
:aligned_hypothesis: please SEARCH the files from the server
|
374
|
+
- :transcription: the real challenge is coming up with a good speech representation
|
375
|
+
of ruby
|
376
|
+
:hypothesis: the real challenge is coming up with Blake good speech representation
|
377
|
+
of rube
|
378
|
+
:insertions: 0
|
379
|
+
:deletions: 0
|
380
|
+
:substitutions: 2
|
381
|
+
:matching: 11
|
382
|
+
:align_cost: 2
|
383
|
+
:transcription_words: 13
|
384
|
+
:aligned_transcription: the real challenge is coming up with A good speech
|
385
|
+
representation of RUBY
|
386
|
+
:aligned_hypothesis: the real challenge is coming up with BLAKE good speech representation
|
387
|
+
of RUBE
|
388
|
+
- :transcription: that follows the principle of least surprise
|
389
|
+
:hypothesis: that follows the principle of the surprise
|
390
|
+
:insertions: 0
|
391
|
+
:deletions: 0
|
392
|
+
:substitutions: 1
|
393
|
+
:matching: 6
|
394
|
+
:align_cost: 1
|
395
|
+
:transcription_words: 7
|
396
|
+
:aligned_transcription: that follows the principle of LEAST surprise
|
397
|
+
:aligned_hypothesis: that follows the principle of THE surprise
|
398
|
+
- :transcription: and deals appropriately with ambiguous cases
|
399
|
+
:hypothesis: and this appropriately it was ambiguous cases
|
400
|
+
:insertions: 1
|
401
|
+
:deletions: 0
|
402
|
+
:substitutions: 2
|
403
|
+
:matching: 4
|
404
|
+
:align_cost: 3
|
405
|
+
:transcription_words: 6
|
406
|
+
:aligned_transcription: and DEALS appropriately *** WITH ambiguous cases
|
407
|
+
:aligned_hypothesis: and THIS appropriately IT WAS ambiguous cases
|
408
|
+
- :transcription: the good thing is that you can get rid of a lot of manual work
|
409
|
+
:hypothesis: the good thing is that you can get rid of a lot of manual work
|
410
|
+
:insertions: 0
|
411
|
+
:deletions: 0
|
412
|
+
:substitutions: 0
|
413
|
+
:matching: 15
|
414
|
+
:align_cost: 0
|
415
|
+
:transcription_words: 15
|
416
|
+
:aligned_transcription: the good thing is that you can get rid of a lot of manual
|
417
|
+
work
|
418
|
+
:aligned_hypothesis: the good thing is that you can get rid of a lot of manual
|
419
|
+
work
|
420
|
+
- :transcription: for example attribute accessors are nearly always placed at the
|
421
|
+
top of the file
|
422
|
+
:hypothesis: for example attribute excesses are nearly always placed at the top
|
423
|
+
of the file
|
424
|
+
:insertions: 0
|
425
|
+
:deletions: 0
|
426
|
+
:substitutions: 1
|
427
|
+
:matching: 13
|
428
|
+
:align_cost: 1
|
429
|
+
:transcription_words: 14
|
430
|
+
:aligned_transcription: for example attribute ACCESSORS are nearly always placed
|
431
|
+
at the top of the file
|
432
|
+
:aligned_hypothesis: for example attribute EXCESSES are nearly always placed at the
|
433
|
+
top of the file
|
434
|
+
- :transcription: so when you say something like
|
435
|
+
:hypothesis: so when you say something like
|
436
|
+
:insertions: 0
|
437
|
+
:deletions: 0
|
438
|
+
:substitutions: 0
|
439
|
+
:matching: 6
|
440
|
+
:align_cost: 0
|
441
|
+
:transcription_words: 6
|
442
|
+
:aligned_transcription: so when you say something like
|
443
|
+
:aligned_hypothesis: so when you say something like
|
444
|
+
- :transcription: attribute accessor file name
|
445
|
+
:hypothesis: attribute access server fine name
|
446
|
+
:insertions: 1
|
447
|
+
:deletions: 0
|
448
|
+
:substitutions: 2
|
449
|
+
:matching: 2
|
450
|
+
:align_cost: 3
|
451
|
+
:transcription_words: 4
|
452
|
+
:aligned_transcription: attribute *** ACCESSOR FILE name
|
453
|
+
:aligned_hypothesis: attribute ACCESS SERVER FINE name
|
454
|
+
- :transcription: it will automatically put the following line at the top of the file
|
455
|
+
in the right place
|
456
|
+
:hypothesis: it will automatically put the following line at the top of the file
|
457
|
+
in the right place
|
458
|
+
:insertions: 0
|
459
|
+
:deletions: 0
|
460
|
+
:substitutions: 0
|
461
|
+
:matching: 17
|
462
|
+
:align_cost: 0
|
463
|
+
:transcription_words: 17
|
464
|
+
:aligned_transcription: it will automatically put the following line at the top
|
465
|
+
of the file in the right place
|
466
|
+
:aligned_hypothesis: it will automatically put the following line at the top of the
|
467
|
+
file in the right place
|
468
|
+
- :transcription: so the whole approach works only with one unified style
|
469
|
+
:hypothesis: so the whole approach works only with one unified style
|
470
|
+
:insertions: 0
|
471
|
+
:deletions: 0
|
472
|
+
:substitutions: 0
|
473
|
+
:matching: 10
|
474
|
+
:align_cost: 0
|
475
|
+
:transcription_words: 10
|
476
|
+
:aligned_transcription: so the whole approach works only with one unified style
|
477
|
+
:aligned_hypothesis: so the whole approach works only with one unified style
|
478
|
+
- :transcription: which is the ruby best practices style which is published on github
|
479
|
+
:hypothesis: which is did ruby best practices style which is published on git help
|
480
|
+
:insertions: 1
|
481
|
+
:deletions: 0
|
482
|
+
:substitutions: 2
|
483
|
+
:matching: 10
|
484
|
+
:align_cost: 3
|
485
|
+
:transcription_words: 12
|
486
|
+
:aligned_transcription: which is THE ruby best practices style which is published
|
487
|
+
on *** GITHUB
|
488
|
+
:aligned_hypothesis: which is DID ruby best practices style which is published
|
489
|
+
on GIT HELP
|
490
|
+
- :transcription: I wonder if I should create a custom language model just for programming
|
491
|
+
:hypothesis: I wonder if I should create a custom language will just for programming
|
492
|
+
:insertions: 0
|
493
|
+
:deletions: 0
|
494
|
+
:substitutions: 1
|
495
|
+
:matching: 12
|
496
|
+
:align_cost: 1
|
497
|
+
:transcription_words: 13
|
498
|
+
:aligned_transcription: i wonder if i should create a custom language MODEL
|
499
|
+
just for programming
|
500
|
+
:aligned_hypothesis: i wonder if i should create a custom language WILL just
|
501
|
+
for programming
|
502
|
+
- :transcription: or if I should use the normal dictation model and just train it
|
503
|
+
for programming
|
504
|
+
:hypothesis: or if I should use that normal dictation on model and just train it
|
505
|
+
for programming
|
506
|
+
:insertions: 1
|
507
|
+
:deletions: 0
|
508
|
+
:substitutions: 1
|
509
|
+
:matching: 14
|
510
|
+
:align_cost: 2
|
511
|
+
:transcription_words: 15
|
512
|
+
:aligned_transcription: or if i should use THE normal dictation *** model and
|
513
|
+
just train it for programming
|
514
|
+
:aligned_hypothesis: or if i should use THAT normal dictation ON model and
|
515
|
+
just train it for programming
|
516
|
+
- :transcription: the advantage is that it would also recognize normal sentences such
|
517
|
+
as commit messages
|
518
|
+
:hypothesis: the advantage is that it would also recognized normal sentences such
|
519
|
+
as commit messages
|
520
|
+
:insertions: 0
|
521
|
+
:deletions: 0
|
522
|
+
:substitutions: 1
|
523
|
+
:matching: 13
|
524
|
+
:align_cost: 1
|
525
|
+
:transcription_words: 14
|
526
|
+
:aligned_transcription: the advantage is that it would also RECOGNIZE normal
|
527
|
+
sentences such as commit messages
|
528
|
+
:aligned_hypothesis: the advantage is that it would also RECOGNIZED normal sentences
|
529
|
+
such as commit messages
|
530
|
+
- :transcription: while the disadvantage is that it would not work as accurate on
|
531
|
+
programming messages
|
532
|
+
:hypothesis: why the disadvantage is that it would not work as a correct on programming
|
533
|
+
messages
|
534
|
+
:insertions: 1
|
535
|
+
:deletions: 0
|
536
|
+
:substitutions: 2
|
537
|
+
:matching: 12
|
538
|
+
:align_cost: 3
|
539
|
+
:transcription_words: 14
|
540
|
+
:aligned_transcription: WHILE the disadvantage is that it would not work as ***
|
541
|
+
ACCURATE on programming messages
|
542
|
+
:aligned_hypothesis: WHY the disadvantage is that it would not work as A CORRECT on programming
|
543
|
+
messages
|
544
|
+
- :transcription: another idea would be a hybrid approach
|
545
|
+
:hypothesis: another idea would be a hybrid approach
|
546
|
+
:insertions: 0
|
547
|
+
:deletions: 0
|
548
|
+
:substitutions: 0
|
549
|
+
:matching: 7
|
550
|
+
:align_cost: 0
|
551
|
+
:transcription_words: 7
|
552
|
+
:aligned_transcription: another idea would be a hybrid approach
|
553
|
+
:aligned_hypothesis: another idea would be a hybrid approach
|
554
|
+
- :transcription: that means whenever you are entering a string value or a commit
|
555
|
+
message it would switch automatically to the dictation language model
|
556
|
+
:hypothesis: that means whenever you are entering any string value or a commit message
|
557
|
+
it would switch automatically to the dictation language more
|
558
|
+
:insertions: 0
|
559
|
+
:deletions: 0
|
560
|
+
:substitutions: 2
|
561
|
+
:matching: 20
|
562
|
+
:align_cost: 2
|
563
|
+
:transcription_words: 22
|
564
|
+
:aligned_transcription: that means whenever you are entering A string value or a commit
|
565
|
+
message it would switch automatically to the dictation language MODEL
|
566
|
+
:aligned_hypothesis: that means whenever you are entering ANY string value or a commit
|
567
|
+
message it would switch automatically to the dictation language MORE
|
568
|
+
- :transcription: I am not sure if Google's voice recognition is actually that good
|
569
|
+
:hypothesis: I am not sure it's Google's voice recognition is actually dead code
|
570
|
+
:insertions: 0
|
571
|
+
:deletions: 0
|
572
|
+
:substitutions: 3
|
573
|
+
:matching: 9
|
574
|
+
:align_cost: 3
|
575
|
+
:transcription_words: 12
|
576
|
+
:aligned_transcription: i am not sure IF google's voice recognition is actually
|
577
|
+
THAT GOOD
|
578
|
+
:aligned_hypothesis: i am not sure IT'S google's voice recognition is actually
|
579
|
+
DEAD CODE
|
580
|
+
- :transcription: we can wait no problem
|
581
|
+
:hypothesis: we can wait no problem
|
582
|
+
:insertions: 0
|
583
|
+
:deletions: 0
|
584
|
+
:substitutions: 0
|
585
|
+
:matching: 5
|
586
|
+
:align_cost: 0
|
587
|
+
:transcription_words: 5
|
588
|
+
:aligned_transcription: we can wait no problem
|
589
|
+
:aligned_hypothesis: we can wait no problem
|
590
|
+
- :transcription: wow it's really fast
|
591
|
+
:hypothesis: wow it's really fast
|
592
|
+
:insertions: 0
|
593
|
+
:deletions: 0
|
594
|
+
:substitutions: 0
|
595
|
+
:matching: 4
|
596
|
+
:align_cost: 0
|
597
|
+
:transcription_words: 4
|
598
|
+
:aligned_transcription: wow it's really fast
|
599
|
+
:aligned_hypothesis: wow it's really fast
|
600
|
+
- :transcription: the URL is different
|
601
|
+
:hypothesis: do you are at it is different
|
602
|
+
:insertions: 3
|
603
|
+
:deletions: 0
|
604
|
+
:substitutions: 2
|
605
|
+
:matching: 2
|
606
|
+
:align_cost: 5
|
607
|
+
:transcription_words: 4
|
608
|
+
:aligned_transcription: '*** *** *** THE URL is different'
|
609
|
+
:aligned_hypothesis: DO YOU ARE AT IT is different
|
610
|
+
- :transcription: we would basically just need to change that
|
611
|
+
:hypothesis: we would basically just need to change that
|
612
|
+
:insertions: 0
|
613
|
+
:deletions: 0
|
614
|
+
:substitutions: 0
|
615
|
+
:matching: 8
|
616
|
+
:align_cost: 0
|
617
|
+
:transcription_words: 8
|
618
|
+
:aligned_transcription: we would basically just need to change that
|
619
|
+
:aligned_hypothesis: we would basically just need to change that
|
620
|
+
- :transcription: logos are symbols that attempt to visually represent the essence
|
621
|
+
of an organization
|
622
|
+
:hypothesis: logos are symbols that attempt to visually represent the essence of
|
623
|
+
an organization
|
624
|
+
:insertions: 0
|
625
|
+
:deletions: 0
|
626
|
+
:substitutions: 0
|
627
|
+
:matching: 13
|
628
|
+
:align_cost: 0
|
629
|
+
:transcription_words: 13
|
630
|
+
:aligned_transcription: logos are symbols that attempt to visually represent the
|
631
|
+
essence of an organization
|
632
|
+
:aligned_hypothesis: logos are symbols that attempt to visually represent the essence
|
633
|
+
of an organization
|
634
|
+
- :transcription: given that the new yahoo logo is a blandly cooperate humourless
|
635
|
+
confused jumble of unappealing elements
|
636
|
+
:hypothesis: given that the new yahoo rule is a plan to cooperate to over less confused
|
637
|
+
jumble of an unappealing elements
|
638
|
+
:insertions: 4
|
639
|
+
:deletions: 0
|
640
|
+
:substitutions: 3
|
641
|
+
:matching: 13
|
642
|
+
:align_cost: 7
|
643
|
+
:transcription_words: 16
|
644
|
+
:aligned_transcription: given that the new yahoo LOGO is a *** BLANDLY cooperate
|
645
|
+
*** *** HUMOURLESS confused jumble of *** unappealing elements
|
646
|
+
:aligned_hypothesis: given that the new yahoo RULE is a PLAN TO cooperate
|
647
|
+
TO OVER LESS confused jumble of AN unappealing elements
|