alignment 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -0
- data/Manifest +8 -0
- data/README.rdoc +26 -0
- data/Rakefile +16 -0
- data/VERSION +1 -0
- data/alignment.gemspec +47 -0
- data/ext/gale_church/.gitignore +3 -0
- data/ext/gale_church/extconf.rb +5 -0
- data/ext/gale_church/gale_church.c +411 -0
- data/lib/.gitignore +1 -0
- data/lib/alignment.rb +86 -0
- data/test/test_alignment.rb +41 -0
- metadata +76 -0
data/CHANGELOG
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
v0.1.0. initial release
|
data/Manifest
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
= alignment
|
2
|
+
|
3
|
+
Alignment functions for corpus linguistics. Currently implemented
|
4
|
+
is the Gale-Church alignment algorithm.
|
5
|
+
|
6
|
+
== Installation
|
7
|
+
|
8
|
+
gem install alignment
|
9
|
+
|
10
|
+
== License
|
11
|
+
|
12
|
+
GPL2
|
13
|
+
|
14
|
+
The Gale-Church implementation was derived from NATools.
|
15
|
+
Implementation is copyright Pernilla Danielsson, Daniel Ridings,
|
16
|
+
and Alberto Simões. Algorithm is copyright William Gale and
|
17
|
+
Kenneth Church.
|
18
|
+
|
19
|
+
Ruby version is copyright Marius L. Jøhndal.
|
20
|
+
|
21
|
+
== Usage
|
22
|
+
|
23
|
+
|
24
|
+
== Development
|
25
|
+
|
26
|
+
The project is hosted on GitHub at http://github.com/mlj/alignment.
|
data/Rakefile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# Register the Jeweler packaging tasks for this gem. When the jeweler gem
# cannot be loaded, print an installation hint instead of failing.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    gem.name              = "alignment"
    gem.summary           = "Alignment functions for corpus linguistics"
    gem.description       = "Alignment functions for corpus linguistics."
    gem.authors           = ['Marius L. Jøhndal']
    gem.email             = "mariuslj (at) ifi [dot] uio (dot) no"
    gem.homepage          = "http://github.com/mlj/alignment"
    gem.rubyforge_project = "alignment"
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install jeweler"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.2.0
|
data/alignment.gemspec
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "alignment"
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Marius L. J\303\270hndal"]
|
12
|
+
s.date = "2011-12-03"
|
13
|
+
s.description = "Alignment functions for corpus linguistics."
|
14
|
+
s.email = "mariuslj (at) ifi [dot] uio (dot) no"
|
15
|
+
s.extensions = ["ext/gale_church/extconf.rb"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
"CHANGELOG",
|
21
|
+
"Manifest",
|
22
|
+
"README.rdoc",
|
23
|
+
"Rakefile",
|
24
|
+
"VERSION",
|
25
|
+
"ext/gale_church/.gitignore",
|
26
|
+
"ext/gale_church/extconf.rb",
|
27
|
+
"ext/gale_church/gale_church.c",
|
28
|
+
"lib/.gitignore",
|
29
|
+
"lib/alignment.rb",
|
30
|
+
"test/test_alignment.rb"
|
31
|
+
]
|
32
|
+
s.homepage = "http://github.com/mlj/alignment"
|
33
|
+
s.require_paths = ["lib"]
|
34
|
+
s.rubyforge_project = "alignment"
|
35
|
+
s.rubygems_version = "1.8.11"
|
36
|
+
s.summary = "Alignment functions for corpus linguistics"
|
37
|
+
|
38
|
+
if s.respond_to? :specification_version then
|
39
|
+
s.specification_version = 3
|
40
|
+
|
41
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
42
|
+
else
|
43
|
+
end
|
44
|
+
else
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
@@ -0,0 +1,411 @@
|
|
1
|
+
/* NATools - Package with parallel corpora tools
|
2
|
+
* Original Implementation Copyright:
|
3
|
+
* Pernilla Danielsson and Daniel Ridings
|
4
|
+
* Algorithm Copyright:
|
5
|
+
* William Gale and Kenneth Church
|
6
|
+
* Copyright (C) 2002-2004 Alberto Simões
|
7
|
+
|
8
|
+
* Ruby adaptation:
|
9
|
+
* Copyright (C) 2008 Marius L. Jøhndal
|
10
|
+
*
|
11
|
+
* This package is free software; you can redistribute it and/or
|
12
|
+
* modify it under the terms of the GNU Lesser General Public
|
13
|
+
* License as published by the Free Software Foundation; either
|
14
|
+
* version 2 of the License, or (at your option) any later version.
|
15
|
+
*
|
16
|
+
* This library is distributed in the hope that it will be useful,
|
17
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
19
|
+
* Lesser General Public License for more details.
|
20
|
+
*
|
21
|
+
* You should have received a copy of the GNU Lesser General Public
|
22
|
+
* License along with this library; if not, write to the
|
23
|
+
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
24
|
+
* Boston, MA 02111-1307, USA.
|
25
|
+
*/
|
26
|
+
|
27
|
+
#include "ruby.h"
|
28
|
+
#include <assert.h>
|
29
|
+
#include <math.h>
|
30
|
+
#include <stdio.h>
|
31
|
+
#include <fcntl.h>
|
32
|
+
#include <stdlib.h>
|
33
|
+
#include <unistd.h>
|
34
|
+
#include <string.h>
|
35
|
+
|
36
|
+
#ifndef RSTRING_PTR /* Ruby 1.8 compatibility */
|
37
|
+
#define RARRAY_LEN(ptr) RARRAY(ptr)->len
|
38
|
+
#endif
|
39
|
+
|
40
|
+
VALUE mLogos = Qnil;
|
41
|
+
VALUE mGaleChurch = Qnil;
|
42
|
+
|
43
|
+
void Init_gale_church();
|
44
|
+
|
45
|
+
VALUE method_align(VALUE self, VALUE len1, VALUE len2);
|
46
|
+
|
47
|
+
struct alignment {
|
48
|
+
int x1;
|
49
|
+
int y1;
|
50
|
+
int x2;
|
51
|
+
int y2;
|
52
|
+
int d;
|
53
|
+
};
|
54
|
+
|
55
|
+
static int two_side_distance(int x1, int y1, int x2, int y2);
|
56
|
+
static int seq_align(int *x, int *y, int nx, int ny, int (*dist_funct)(),
|
57
|
+
struct alignment **align);
|
58
|
+
|
59
|
+
void Init_gale_church()
|
60
|
+
{
|
61
|
+
mLogos = rb_define_module("Alignment");
|
62
|
+
mGaleChurch = rb_define_module_under(mLogos, "GaleChurch");
|
63
|
+
rb_define_module_function(mGaleChurch, "align", method_align, 2);
|
64
|
+
}
|
65
|
+
|
66
|
+
/*
 * Alignment::GaleChurch.align(len1, len2)
 *
 * len1 and len2 must be Ruby Arrays of integer segment lengths. The lengths
 * are copied into C arrays and aligned with seq_align() using
 * two_side_distance() as the cost function.
 *
 * Returns a two-element Array [res1, res2]. res1 and res2 each hold one
 * entry per alignment step; entry k of res1 is an Array of indices into
 * len1 and entry k of res2 is an Array of indices into len2 that were
 * grouped together in step k.
 *
 * Raises TypeError when an argument is not an Array (Check_Type), and
 * whatever NUM2INT raises for an element that cannot be converted to int.
 *
 * NOTE: seq_align() uses 0 in struct alignment to mean "no segment", so the
 * index bookkeeping below is only reliable for non-zero lengths.
 * NOTE(review): if NUM2INT raises while the buffers are being filled, the
 * buffers are not released; this matches the previous behaviour.
 */
VALUE method_align(VALUE self, VALUE len1, VALUE len2)
{
  VALUE ret, res1, res2, grouped_regions1, grouped_regions2;
  long count1, count2;
  int n, i, ix, iy, prevx, prevy;
  struct alignment *align, *a;
  int *len1_x;
  int *len2_x;

  /* RARRAY_LEN is only meaningful for real Arrays; reject anything else
   * before touching the internals. */
  Check_Type(len1, T_ARRAY);
  Check_Type(len2, T_ARRAY);

  count1 = RARRAY_LEN(len1);
  count2 = RARRAY_LEN(len2);

  res1 = rb_ary_new();
  res2 = rb_ary_new();

  /* ALLOC_N raises NoMemoryError on failure instead of returning NULL. */
  len1_x = ALLOC_N(int, count1);
  len2_x = ALLOC_N(int, count2);

  for (i = 0; i < count1; i++)
    len1_x[i] = NUM2INT(rb_ary_entry(len1, i));

  for (i = 0; i < count2; i++)
    len2_x[i] = NUM2INT(rb_ary_entry(len2, i));

  n = seq_align(len1_x, len2_x, (int)count1, (int)count2,
                two_side_distance, &align);

  xfree(len1_x);
  xfree(len2_x);

  prevx = prevy = ix = iy = 0;

  for (i = 0; i < n; i++) {
    a = &align[i];

    /* Advance ix by the number of len1 segments consumed by this step:
     * two when x2 is set, none when x1 is absent, otherwise one
     * (the unconditional increment below supplies the "one"). */
    if (a->x2 > 0)
      ix++;
    else if (a->x1 == 0)
      ix--;

    /* Same bookkeeping for the len2 side. */
    if (a->y2 > 0)
      iy++;
    else if (a->y1 == 0)
      iy--;

    /* A step with all four lengths zero still consumes one segment on
     * each side; undo the decrements applied above. */
    if (a->x1 == 0 && a->y1 == 0 && a->x2 == 0 && a->y2 == 0) {
      ix++;
      iy++;
    }

    ix++;
    iy++;

    /* Indices [prevx, ix) of len1 belong to this step. */
    grouped_regions1 = rb_ary_new();
    for (; prevx < ix; prevx++)
      rb_ary_push(grouped_regions1, INT2FIX(prevx));
    rb_ary_push(res1, grouped_regions1);

    /* Indices [prevy, iy) of len2 belong to this step. */
    grouped_regions2 = rb_ary_new();
    for (; prevy < iy; prevy++)
      rb_ary_push(grouped_regions2, INT2FIX(prevy));
    rb_ary_push(res2, grouped_regions2);
  }

  /* align was obtained with malloc() inside seq_align(). */
  free(align);

  ret = rb_ary_new();
  rb_ary_push(ret, res1);
  rb_ary_push(ret, res2);

  return ret;
}
|
135
|
+
|
136
|
+
/********************************************************************/
|
137
|
+
|
138
|
+
/**
|
139
|
+
* @file
|
140
|
+
* @brief Sentence-aligner main program
|
141
|
+
*
|
142
|
+
* Sentence-aligner software heavily based on Pernilla Danielsson and
|
143
|
+
* Daniel Ridings implementation of William Gale and Kenneth Church
|
144
|
+
* sentence aligner algorithm
|
145
|
+
*
|
146
|
+
* The compiled program is used as
|
147
|
+
* <pre>
|
148
|
+
* nat-sentence-aligner -D '.PARA' -d '.End of Sentence' file1 file2
|
149
|
+
* </pre>
|
150
|
+
* where both <i>file1</i> and <i>file2</i> are tokenized and with a
|
151
|
+
* token per line. In the example, '.PARA' is the hard delimiter,
|
152
|
+
* and '.End of Sentence' the soft delimiter. They are considered as
|
153
|
+
* single tokens, and as such, should appear in a line by themselves.
|
154
|
+
*
|
155
|
+
* The program is allowed to delete soft delimiters as necessary in
|
156
|
+
* order to align the files, but it cannot change hard delimiters.
|
157
|
+
*
|
158
|
+
* The output will be written in two files: file1.al and file2.al
|
159
|
+
* where each one contains aligned sentences.
|
160
|
+
*
|
161
|
+
* For debugging it can be useful to output the data in just one file.
|
162
|
+
* In this case, use the '-s' switch and just the 'file1.al' will be
|
163
|
+
* created.
|
164
|
+
*
|
165
|
+
* @todo Check if we want to document all the functions
|
166
|
+
*/
|
167
|
+
|
168
|
+
|
169
|
+
#define dist(x,y) distances[(x) * ((ny) + 1) + (y)]
|
170
|
+
#define pathx(x,y) path_x[(x) * ((ny) + 1) + (y)]
|
171
|
+
#define pathy(x,y) path_y[(x) * ((ny) + 1) + (y)]
|
172
|
+
|
173
|
+
/**
|
174
|
+
* @brief Constant representing a big align distance between two
|
175
|
+
* sentences
|
176
|
+
*/
|
177
|
+
#define BIG_DISTANCE 2500
|
178
|
+
|
179
|
+
/*
|
180
|
+
|
181
|
+
seq_align by Mike Riley
|
182
|
+
Sequence alignment routine.
|
183
|
+
This version allows for contraction/expansions.
|
184
|
+
|
185
|
+
x and y are sequences of objects, represented as non-zero ints, to be aligned.
|
186
|
+
|
187
|
+
dist_funct(x1, y1, x2, y2) is a distance function of 4 args:
|
188
|
+
|
189
|
+
dist_funct(x1, y1, 0, 0) gives cost of substitution of x1 by y1.
|
190
|
+
dist_funct(x1, 0, 0, 0) gives cost of deletion of x1.
|
191
|
+
dist_funct(0, y1, 0, 0) gives cost of insertion of y1.
|
192
|
+
dist_funct(x1, y1, x2, 0) gives cost of contraction of (x1,x2) to y1.
|
193
|
+
dist_funct(x1, y1, 0, y2) gives cost of expansion of x1 to (y1,y2).
|
194
|
+
dist_funct(x1, y1, x2, y2) gives cost to match (x1,x2) to (y1,y2).
|
195
|
+
|
196
|
+
align is the alignment, with (align[i].x1, align[i].x2) aligned
|
197
|
+
with (align[i].y1, align[i].y2). Zero in align[].x1 and align[].y1
|
198
|
+
correspond to insertion and deletion, respectively. Non-zero in
|
199
|
+
align[].x2 and align[].y2 correspond to contraction and expansion,
|
200
|
+
respectively. align[].d gives the distance for that pairing.
|
201
|
+
|
202
|
+
The function returns the length of the alignment.
|
203
|
+
|
204
|
+
*/
|
205
|
+
|
206
|
+
static int seq_align(int *x, int *y, int nx, int ny, int (*dist_funct)(),
|
207
|
+
struct alignment **align)
|
208
|
+
{
|
209
|
+
int *distances, *path_x, *path_y, n;
|
210
|
+
int i, j, oi, oj, di, dj, d1, d2, d3, d4, d5, d6, dmin;
|
211
|
+
struct alignment *ralign;
|
212
|
+
|
213
|
+
distances = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
|
214
|
+
path_x = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
|
215
|
+
path_y = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
|
216
|
+
ralign = (struct alignment *) malloc((nx + ny) * sizeof(struct alignment));
|
217
|
+
|
218
|
+
for (j = 0; j <= ny; j++) {
|
219
|
+
for (i = 0; i <= nx; i++) {
|
220
|
+
d1 = i>0 && j>0 ? /* substitution */
|
221
|
+
dist(i-1, j-1) + (*dist_funct)(x[i-1], y[j-1], 0, 0)
|
222
|
+
: INT_MAX;
|
223
|
+
d2 = i>0 ? /* deletion */
|
224
|
+
dist(i-1, j) + (*dist_funct)(x[i-1], 0, 0, 0)
|
225
|
+
: INT_MAX;
|
226
|
+
d3 = j>0 ? /* insertion */
|
227
|
+
dist(i, j-1) + (*dist_funct)(0, y[j-1], 0, 0)
|
228
|
+
: INT_MAX;
|
229
|
+
d4 = i>1 && j>0 ? /* contraction */
|
230
|
+
dist(i-2, j-1) + (*dist_funct)(x[i-2], y[j-1], x[i-1], 0)
|
231
|
+
: INT_MAX;
|
232
|
+
d5 = i>0 && j>1 ? /* expansion */
|
233
|
+
dist(i-1, j-2) + (*dist_funct)(x[i-1], y[j-2], 0, y[j-1])
|
234
|
+
: INT_MAX;
|
235
|
+
d6 = i>1 && j>1 ? /* melding */
|
236
|
+
dist(i-2, j-2) + (*dist_funct)(x[i-2], y[j-2], x[i-1], y[j-1])
|
237
|
+
: INT_MAX;
|
238
|
+
|
239
|
+
dmin = d1;
|
240
|
+
if (d2<dmin) dmin=d2;
|
241
|
+
if (d3<dmin) dmin=d3;
|
242
|
+
if (d4<dmin) dmin=d4;
|
243
|
+
if (d5<dmin) dmin=d5;
|
244
|
+
if (d6<dmin) dmin=d6;
|
245
|
+
|
246
|
+
if (dmin == INT_MAX) {
|
247
|
+
dist(i,j) = 0;
|
248
|
+
}
|
249
|
+
else if (dmin == d1) {
|
250
|
+
dist(i,j) = d1;
|
251
|
+
pathx(i,j) = i-1;
|
252
|
+
pathy(i,j) = j-1;
|
253
|
+
}
|
254
|
+
else if (dmin == d2) {
|
255
|
+
dist(i,j) = d2;
|
256
|
+
pathx(i,j) = i-1;
|
257
|
+
pathy(i,j) = j;
|
258
|
+
}
|
259
|
+
else if (dmin == d3) {
|
260
|
+
dist(i,j) = d3;
|
261
|
+
pathx(i,j) = i;
|
262
|
+
pathy(i,j) = j-1;
|
263
|
+
}
|
264
|
+
else if (dmin == d4) {
|
265
|
+
dist(i,j) = d4;
|
266
|
+
pathx(i,j) = i-2;
|
267
|
+
pathy(i,j) = j-1;
|
268
|
+
}
|
269
|
+
else if (dmin == d5){
|
270
|
+
dist(i,j) = d5;
|
271
|
+
pathx(i,j) = i-1;
|
272
|
+
pathy(i,j) = j-2;
|
273
|
+
}
|
274
|
+
else /* dmin == d6 */ {
|
275
|
+
dist(i,j) = d6;
|
276
|
+
pathx(i,j) = i-2;
|
277
|
+
pathy(i,j) = j-2;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
}
|
281
|
+
|
282
|
+
n = 0;
|
283
|
+
for (i=nx, j=ny ; i>0 || j>0 ; i = oi, j = oj) {
|
284
|
+
oi = pathx(i, j);
|
285
|
+
oj = pathy(i, j);
|
286
|
+
di = i - oi;
|
287
|
+
dj = j - oj;
|
288
|
+
|
289
|
+
if (di == 1 && dj == 1) { /* substitution */
|
290
|
+
ralign[n].x1 = x[i-1];
|
291
|
+
ralign[n].y1 = y[j-1];
|
292
|
+
ralign[n].x2 = 0;
|
293
|
+
ralign[n].y2 = 0;
|
294
|
+
ralign[n++].d = dist(i, j) - dist(i-1, j-1);
|
295
|
+
}
|
296
|
+
|
297
|
+
else if (di == 1 && dj == 0) { /* deletion */
|
298
|
+
ralign[n].x1 = x[i-1];
|
299
|
+
ralign[n].y1 = 0;
|
300
|
+
ralign[n].x2 = 0;
|
301
|
+
ralign[n].y2 = 0;
|
302
|
+
ralign[n++].d = dist(i, j) - dist(i-1, j);
|
303
|
+
}
|
304
|
+
|
305
|
+
else if (di == 0 && dj == 1) { /* insertion */
|
306
|
+
ralign[n].x1 = 0;
|
307
|
+
ralign[n].y1 = y[j-1];
|
308
|
+
ralign[n].x2 = 0;
|
309
|
+
ralign[n].y2 = 0;
|
310
|
+
ralign[n++].d = dist(i, j) - dist(i, j-1);
|
311
|
+
}
|
312
|
+
|
313
|
+
else if (dj == 1) { /* contraction */
|
314
|
+
ralign[n].x1 = x[i-2];
|
315
|
+
ralign[n].y1 = y[j-1];
|
316
|
+
ralign[n].x2 = x[i-1];
|
317
|
+
ralign[n].y2 = 0;
|
318
|
+
ralign[n++].d = dist(i, j) - dist(i-2, j-1);
|
319
|
+
}
|
320
|
+
|
321
|
+
else if (di == 1) { /* expansion */
|
322
|
+
ralign[n].x1 = x[i-1];
|
323
|
+
ralign[n].y1 = y[j-2];
|
324
|
+
ralign[n].x2 = 0;
|
325
|
+
ralign[n].y2 = y[j-1];
|
326
|
+
ralign[n++].d = dist(i, j) - dist(i-1, j-2);
|
327
|
+
}
|
328
|
+
else /* di == 2 && dj == 2 */ { /* melding */
|
329
|
+
ralign[n].x1 = x[i-2];
|
330
|
+
ralign[n].y1 = y[j-2];
|
331
|
+
ralign[n].x2 = x[i-1];
|
332
|
+
ralign[n].y2 = y[j-1];
|
333
|
+
ralign[n++].d = dist(i, j) - dist(i-2, j-2);
|
334
|
+
}
|
335
|
+
}
|
336
|
+
|
337
|
+
*align = (struct alignment *) malloc(n * sizeof(struct alignment));
|
338
|
+
|
339
|
+
for (i=0; i<n; i++)
|
340
|
+
bcopy(ralign + i, (*align) + (n-i-1), sizeof(struct alignment));
|
341
|
+
|
342
|
+
free(distances);
|
343
|
+
free(path_x);
|
344
|
+
free(path_y);
|
345
|
+
free(ralign);
|
346
|
+
return(n);
|
347
|
+
}
|
348
|
+
|
349
|
+
/* Returns the area under a normal distribution
|
350
|
+
from -inf to z standard deviations */
|
351
|
+
static double pnorm(double z)
|
352
|
+
{
|
353
|
+
double t, pd;
|
354
|
+
t = 1/(1 + 0.2316419 * z);
|
355
|
+
pd = 1 - 0.3989423 *
|
356
|
+
exp(-z * z/2) *
|
357
|
+
((((1.330274429 * t - 1.821255978) * t
|
358
|
+
+ 1.781477937) * t - 0.356563782) * t + 0.319381530) * t;
|
359
|
+
/* see Gradsteyn & Rhyzik, 26.2.17 p932 */
|
360
|
+
return(pd);
|
361
|
+
}
|
362
|
+
|
363
|
+
|
364
|
+
/* Return -100 * log probability that an English sentence of length
|
365
|
+
len1 is a translation of a foreign sentence of length len2. The
|
366
|
+
probability is based on two parameters, the mean and variance of
|
367
|
+
number of foreign characters per English character.
|
368
|
+
*/
|
369
|
+
|
370
|
+
static int match(int len1,int len2)
|
371
|
+
{
|
372
|
+
double z, pd, mean;
|
373
|
+
|
374
|
+
/* foreign characters per english character */
|
375
|
+
double foreign_chars_per_eng_char = 1;
|
376
|
+
|
377
|
+
/* variance per english character */
|
378
|
+
double var_per_eng_char = 6.8 ;
|
379
|
+
|
380
|
+
if (len1==0 && len2==0) return(0);
|
381
|
+
mean = (len1 + len2/foreign_chars_per_eng_char)/2;
|
382
|
+
z = (foreign_chars_per_eng_char * len1 - len2)/sqrt(var_per_eng_char * mean);
|
383
|
+
|
384
|
+
/* Need to deal with both sides of the normal distribution */
|
385
|
+
if (z < 0) z = -z;
|
386
|
+
pd = 2 * (1 - pnorm(z));
|
387
|
+
|
388
|
+
if (pd > 0) return((int)(-100 * log(pd)));
|
389
|
+
else return(BIG_DISTANCE);
|
390
|
+
}
|
391
|
+
|
392
|
+
/*
 * Cost function passed to seq_align(). A zero argument means "no segment in
 * this position". The shape of the pairing (which of x2/y2/x1/y1 are zero)
 * selects the penalty added to match() of the combined lengths.
 */
static int two_side_distance(int x1, int y1, int x2, int y2)
{
  const int penalty21 = 230;  /* -100 * log([prob of 2-1 match] / [prob of 1-1 match]) */
  const int penalty22 = 440;  /* -100 * log([prob of 2-2 match] / [prob of 1-1 match]) */
  const int penalty01 = 450;  /* -100 * log([prob of 0-1 match] / [prob of 1-1 match]) */

  if (x2 == 0 && y2 == 0) {
    /* insertion (x1 == 0) or deletion (y1 == 0) */
    if (x1 == 0 || y1 == 0)
      return(match(x1, y1) + penalty01);

    /* substitution */
    return(match(x1, y1));
  }

  /* expansion */
  if (x2 == 0)
    return(match(x1, y1 + y2) + penalty21);

  /* contraction */
  if (y2 == 0)
    return(match(x1 + x2, y1) + penalty21);

  /* melding */
  return(match(x1 + x2, y1 + y2) + penalty22);
}
|
data/lib/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.so
|
data/lib/alignment.rb
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# alignment.rb - Alignment functions
|
4
|
+
#
|
5
|
+
# Written by Marius L. Jøhndal <mariuslj at ifi.uio.no>, 2008
|
6
|
+
#
|
7
|
+
require 'gale_church'
|
8
|
+
|
9
|
+
module Alignment
  # Returns true if +o+ can take part in an alignment, i.e. if it responds
  # to +weight+.
  def self.is_alignable?(o)
    o.respond_to?(:weight)
  end

  # Represents an alignment of two regions, one +left+ region and one
  # +right+ region. Both regions are arrays of alignable objects.
  AlignedRegions = Struct.new(:left, :right)

  class AlignedRegions
    # Renders the pair as "<left,right>", where each side is the
    # concatenation of the string forms of its members. Array#join is used
    # explicitly because interpolating an Array only concatenates its
    # members on Ruby 1.8.
    def to_s
      "<#{left.join},#{right.join}>"
    end
  end

  # Aligns two arrays of regions +a+ and +b+ using +method+ (only
  # +:gale_church+ is supported). Returns an array of +AlignedRegions+.
  #
  # Raises ArgumentError for an unknown +method+ and RuntimeError if the
  # aligner returns a different number of blocks for the two sides.
  def self.align_regions(a, b, method = :gale_church)
    case method
    when :gale_church
      x, y = Alignment::GaleChurch.align(a.map(&:weight), b.map(&:weight))
      raise "Error aligning regions: returned block count does not match" unless x.length == y.length

      # x and y hold, per aligned block, the indices into a and b.
      x.zip(y).map do |r, s|
        AlignedRegions.new(r.map { |i| a[i] }, s.map { |i| b[i] })
      end
    else
      raise ArgumentError, "invalid method #{method}"
    end
  end

  # A piece of text that can be aligned; its weight is its word count.
  class AlignableText
    attr_reader :text

    def initialize(text)
      @text = text
    end

    # Number of space-separated words in the text.
    #
    # FIXME: decide between character length and word count as the weight;
    # word count is what has been returned so far.
    def weight
      @text.split(' ').length
    end

    def to_s
      @text
    end
  end

  ANCHOR_REGEXP = /\s*\|\|\s*/
  BOUNDARY_REGEXP = /\s*\|\s*/

  # Aligns two strings of text +a+ and +b+ using method +method+. The
  # boundaries of the blocks to be aligned are denoted by a | character (with
  # or without surrounding white-space). The sequence || denotes an anchor
  # (or hard delimiter), i.e. a fixed synchronisation point. There must be
  # the same number of anchors in both strings, but the number of boundaries
  # may be different. The returned value is an array of pairs of aligned
  # strings.
  #
  # Raises ArgumentError if the two strings split into a different number of
  # anchored sections.
  def self.align_text(a, b, method = :gale_church)
    regions_a = a.split(ANCHOR_REGEXP)
    regions_b = b.split(ANCHOR_REGEXP)

    raise ArgumentError, "different number of anchors in strings" unless regions_a.length == regions_b.length

    result = []

    regions_a.zip(regions_b).each do |x, y|
      left  = x.split(BOUNDARY_REGEXP).map { |region| AlignableText.new(region) }
      right = y.split(BOUNDARY_REGEXP).map { |region| AlignableText.new(region) }

      align_regions(left, right, method).each do |pair|
        result << [pair.left.map(&:text).join(' '), pair.right.map(&:text).join(' ')]
      end
    end

    result
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'alignment'
|
2
|
+
require 'test/unit'
|
3
|
+
|
4
|
+
# Exercises Alignment.align_regions and Alignment.align_text with the
# Gale-Church method.
class AlignmentTestCase < Test::Unit::TestCase
  # Region-level alignment, rendered through AlignedRegions#to_s.
  def test_church_alignment
    assert_equal "<foo,bar>", rendered_alignment(%w[foo], %w[bar])
    assert_equal "<foo,barbaz>", rendered_alignment(%w[foo], %w[bar baz])
    assert_equal "<,bar>", rendered_alignment([], %w[bar])
    assert_equal "", rendered_alignment([], [])
    assert_equal "<foo,bar>,<koo,baz>", rendered_alignment(%w[foo koo], %w[bar baz])
  end

  # Text-level alignment with soft (|) and hard (||) delimiters.
  def test_church_text_alignment
    cases = [
      [
        [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper"], ["over the lazy dog", "over den trege hunden"]],
        "The quick brown fox | jumps | over the lazy dog",
        "Den kvikke brune reven | hopper | over den trege hunden"
      ],
      [
        [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper elegant"], ["over the lazy dog", "over den trege hunden"]],
        "The quick brown fox | jumps | over the lazy dog",
        "Den kvikke brune reven | hopper elegant | over den trege hunden"
      ],
      [
        [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper"], ["over the lazy dog", "elegant over den trege hunden"]],
        "The quick brown fox | jumps || over the lazy dog",
        "Den kvikke brune reven | hopper || elegant | over den trege hunden"
      ]
    ]

    cases.each do |expected, left, right|
      assert_equal expected, Alignment::align_text(left, right)
    end
  end

  private

  # Wraps each word in an AlignableText, aligns the two lists and joins the
  # string forms of the resulting regions with commas.
  def rendered_alignment(left_words, right_words)
    x = left_words.map { |word| Alignment::AlignableText.new(word) }
    y = right_words.map { |word| Alignment::AlignableText.new(word) }
    Alignment::align_regions(x, y).map(&:to_s).join(",")
  end
end
|
41
|
+
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: alignment
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- "Marius L. J\xC3\xB8hndal"
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-12-11 00:00:00 Z
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Alignment functions for corpus linguistics.
|
22
|
+
email: mariuslj (at) ifi [dot] uio (dot) no
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions:
|
26
|
+
- ext/gale_church/extconf.rb
|
27
|
+
extra_rdoc_files:
|
28
|
+
- README.rdoc
|
29
|
+
files:
|
30
|
+
- CHANGELOG
|
31
|
+
- Manifest
|
32
|
+
- README.rdoc
|
33
|
+
- Rakefile
|
34
|
+
- VERSION
|
35
|
+
- alignment.gemspec
|
36
|
+
- ext/gale_church/.gitignore
|
37
|
+
- ext/gale_church/extconf.rb
|
38
|
+
- ext/gale_church/gale_church.c
|
39
|
+
- lib/.gitignore
|
40
|
+
- lib/alignment.rb
|
41
|
+
- test/test_alignment.rb
|
42
|
+
homepage: http://github.com/mlj/alignment
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
hash: 3
|
56
|
+
segments:
|
57
|
+
- 0
|
58
|
+
version: "0"
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
hash: 3
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
version: "0"
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: alignment
|
71
|
+
rubygems_version: 1.8.11
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Alignment functions for corpus linguistics
|
75
|
+
test_files: []
|
76
|
+
|