alignment 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +8 -0
- data/README.rdoc +26 -0
- data/Rakefile +16 -0
- data/VERSION +1 -0
- data/alignment.gemspec +47 -0
- data/ext/gale_church/.gitignore +3 -0
- data/ext/gale_church/extconf.rb +5 -0
- data/ext/gale_church/gale_church.c +411 -0
- data/lib/.gitignore +1 -0
- data/lib/alignment.rb +86 -0
- data/test/test_alignment.rb +41 -0
- metadata +76 -0
data/CHANGELOG
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
v0.1.0. initial release
|
data/Manifest
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
= alignment
|
2
|
+
|
3
|
+
Alignment functions for corpus linguistics. Currently implemented
|
4
|
+
is the Gale-Church alignment algorithm.
|
5
|
+
|
6
|
+
== Installation
|
7
|
+
|
8
|
+
gem install alignment
|
9
|
+
|
10
|
+
== License
|
11
|
+
|
12
|
+
GPL2
|
13
|
+
|
14
|
+
The Gale-Church implementation was derived from NATools.
|
15
|
+
Implementation is copyright Pernilla Danielsson, Daniel Ridings,
|
16
|
+
and Alberto Simões. Algorithm is copyright William Gale and
|
17
|
+
Kenneth Church.
|
18
|
+
|
19
|
+
Ruby version is copyright Marius L. Jøhndal.
|
20
|
+
|
21
|
+
== Usage
|
22
|
+
|
23
|
+
|
24
|
+
== Development
|
25
|
+
|
26
|
+
The project is hosted on GitHub at http://github.com/mlj/alignment.
|
data/Rakefile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# Gem packaging tasks. Jeweler is optional: when it is not installed the
# tasks are simply not defined and a hint is printed instead.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    gem.name              = "alignment"
    gem.summary           = "Alignment functions for corpus linguistics"
    gem.description       = "Alignment functions for corpus linguistics."
    gem.authors           = ['Marius L. Jøhndal']
    gem.email             = "mariuslj (at) ifi [dot] uio (dot) no"
    gem.homepage          = "http://github.com/mlj/alignment"
    gem.rubyforge_project = "alignment"
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install jeweler"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.2.0
|
data/alignment.gemspec
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = "alignment"
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Marius L. J\303\270hndal"]
|
12
|
+
s.date = "2011-12-03"
|
13
|
+
s.description = "Alignment functions for corpus linguistics."
|
14
|
+
s.email = "mariuslj (at) ifi [dot] uio (dot) no"
|
15
|
+
s.extensions = ["ext/gale_church/extconf.rb"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
"CHANGELOG",
|
21
|
+
"Manifest",
|
22
|
+
"README.rdoc",
|
23
|
+
"Rakefile",
|
24
|
+
"VERSION",
|
25
|
+
"ext/gale_church/.gitignore",
|
26
|
+
"ext/gale_church/extconf.rb",
|
27
|
+
"ext/gale_church/gale_church.c",
|
28
|
+
"lib/.gitignore",
|
29
|
+
"lib/alignment.rb",
|
30
|
+
"test/test_alignment.rb"
|
31
|
+
]
|
32
|
+
s.homepage = "http://github.com/mlj/alignment"
|
33
|
+
s.require_paths = ["lib"]
|
34
|
+
s.rubyforge_project = "alignment"
|
35
|
+
s.rubygems_version = "1.8.11"
|
36
|
+
s.summary = "Alignment functions for corpus linguistics"
|
37
|
+
|
38
|
+
if s.respond_to? :specification_version then
|
39
|
+
s.specification_version = 3
|
40
|
+
|
41
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
42
|
+
else
|
43
|
+
end
|
44
|
+
else
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
@@ -0,0 +1,411 @@
|
|
1
|
+
/* NATools - Package with parallel corpora tools
|
2
|
+
* Original Implementation Copyright:
|
3
|
+
* Pernilla Danielsson and Daniel Ridings
|
4
|
+
* Algorithm Copyright:
|
5
|
+
* William Gale and Kenneth Church
|
6
|
+
* Copyright (C) 2002-2004 Alberto Simões
|
7
|
+
|
8
|
+
* Ruby adaptation:
|
9
|
+
* Copyright (C) 2008 Marius L. Jøhndal
|
10
|
+
*
|
11
|
+
* This package is free software; you can redistribute it and/or
|
12
|
+
* modify it under the terms of the GNU Lesser General Public
|
13
|
+
* License as published by the Free Software Foundation; either
|
14
|
+
* version 2 of the License, or (at your option) any later version.
|
15
|
+
*
|
16
|
+
* This library is distributed in the hope that it will be useful,
|
17
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
19
|
+
* Lesser General Public License for more details.
|
20
|
+
*
|
21
|
+
* You should have received a copy of the GNU Lesser General Public
|
22
|
+
* License along with this library; if not, write to the
|
23
|
+
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
24
|
+
* Boston, MA 02111-1307, USA.
|
25
|
+
*/
|
26
|
+
|
27
|
+
#include "ruby.h"

#include <assert.h>
#include <fcntl.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
|
35
|
+
|
36
|
+
/* Ruby 1.8 compatibility: supply RARRAY_LEN when the Ruby headers do not.
   The guard tests the macro that is actually being defined, so the fallback
   can never clash with (or be skipped because of) an unrelated macro. */
#ifndef RARRAY_LEN
#define RARRAY_LEN(ptr) (RARRAY(ptr)->len)
#endif
|
39
|
+
|
40
|
+
VALUE mLogos = Qnil;
|
41
|
+
VALUE mGaleChurch = Qnil;
|
42
|
+
|
43
|
+
void Init_gale_church();
|
44
|
+
|
45
|
+
VALUE method_align(VALUE self, VALUE len1, VALUE len2);
|
46
|
+
|
47
|
+
/* One step of an alignment as produced by seq_align: (x1, x2) from the
   first sequence is aligned with (y1, y2) from the second. */
struct alignment {
  int x1; /* first element taken from x; zero corresponds to an insertion */
  int y1; /* first element taken from y; zero corresponds to a deletion */
  int x2; /* second element taken from x; non-zero corresponds to a contraction */
  int y2; /* second element taken from y; non-zero corresponds to an expansion */
  int d;  /* distance for this pairing */
};
|
54
|
+
|
55
|
+
static int two_side_distance(int x1, int y1, int x2, int y2);
|
56
|
+
static int seq_align(int *x, int *y, int nx, int ny, int (*dist_funct)(),
|
57
|
+
struct alignment **align);
|
58
|
+
|
59
|
+
/* Extension initialiser: defines the Alignment module, the nested
   Alignment::GaleChurch module, and its two-argument module function
   "align" (implemented by method_align). */
void Init_gale_church()
{
  /* NOTE: despite its name, mLogos holds the "Alignment" module. */
  mLogos = rb_define_module("Alignment");
  mGaleChurch = rb_define_module_under(mLogos, "GaleChurch");
  rb_define_module_function(mGaleChurch, "align", method_align, 2);
}
|
65
|
+
|
66
|
+
/* Conversion job handed to gc_fill_lengths through rb_protect. */
struct gc_length_job {
  VALUE src;   /* Ruby array holding the numeric lengths */
  long count;  /* number of entries to convert */
  int *dst;    /* destination buffer with room for count ints */
};

/* Converts every entry of job->src to a C int. NUM2INT may raise, which is
   why this runs under rb_protect: the caller can then release its malloc'd
   buffers before the exception continues to propagate. */
static VALUE gc_fill_lengths(VALUE arg)
{
  struct gc_length_job *job = (struct gc_length_job *)arg;
  long k;

  for (k = 0; k < job->count; k++)
    job->dst[k] = NUM2INT(rb_ary_entry(job->src, k));

  return Qnil;
}

/* Alignment::GaleChurch.align(len1, len2)
 *
 * len1 and len2 are Ruby arrays of region lengths. Returns a two-element
 * array [res1, res2]; res1[k] and res2[k] are arrays of indices (into len1
 * and len2 respectively) that belong to the k-th aligned group.
 *
 * Raises TypeError when an argument is not an Array or an entry cannot be
 * converted to an int, ArgumentError when an array is too long for the
 * int-based aligner, and NoMemoryError when a buffer cannot be allocated.
 */
VALUE method_align(VALUE self, VALUE len1, VALUE len2)
{
  VALUE ret, res1, res2, group;
  struct gc_length_job job1, job2;
  struct alignment *align = NULL, *a;
  int n, i, ix, iy, prevx, prevy;
  int state = 0;

  /* RARRAY_LEN on a non-array is undefined behaviour, so check first. */
  Check_Type(len1, T_ARRAY);
  Check_Type(len2, T_ARRAY);

  job1.src = len1;
  job1.count = RARRAY_LEN(len1);
  job2.src = len2;
  job2.count = RARRAY_LEN(len2);

  /* seq_align takes int counts; refuse lengths that would be truncated. */
  if (job1.count >= INT_MAX || job2.count >= INT_MAX)
    rb_raise(rb_eArgError, "too many regions to align");

  /* Always request at least one element so that a NULL result can only
     mean an allocation failure. */
  job1.dst = (int *)malloc((size_t)(job1.count ? job1.count : 1) * sizeof(int));
  job2.dst = (int *)malloc((size_t)(job2.count ? job2.count : 1) * sizeof(int));

  if (job1.dst == NULL || job2.dst == NULL) {
    free(job1.dst);
    free(job2.dst);
    rb_memerror();
  }

  rb_protect(gc_fill_lengths, (VALUE)&job1, &state);
  if (!state)
    rb_protect(gc_fill_lengths, (VALUE)&job2, &state);

  if (state) {
    /* A conversion raised: free the buffers, then re-raise. */
    free(job1.dst);
    free(job2.dst);
    rb_jump_tag(state);
  }

  n = seq_align(job1.dst, job2.dst, (int)job1.count, (int)job2.count,
                two_side_distance, &align);

  free(job1.dst);
  free(job2.dst);

  /* Defensive: a negative count means no alignment could be built. */
  if (n < 0) {
    free(align);
    rb_memerror();
  }

  res1 = rb_ary_new();
  res2 = rb_ary_new();

  prevx = prevy = ix = iy = 0;

  for (i = 0; i < n; i++) {
    a = &align[i];

    /* Advance ix/iy by the number of entries this step consumed from each
       side: two when the second slot is used, none when the first slot is
       zero (insertion/deletion), otherwise one. The unconditional
       increments below account for the "one" case. */
    if (a->x2 > 0)
      ix++;
    else if (a->x1 == 0)
      ix--;

    if (a->y2 > 0)
      iy++;
    else if (a->y1 == 0)
      iy--;

    /* An all-zero step is a pairing of two zero-length entries, not an
       insertion plus a deletion: undo both decrements above. */
    if (a->x1 == 0 && a->y1 == 0 && a->x2 == 0 && a->y2 == 0) {
      ix++;
      iy++;
    }

    ix++;
    iy++;

    group = rb_ary_new();
    for (; prevx < ix; prevx++)
      rb_ary_push(group, INT2FIX(prevx));
    rb_ary_push(res1, group);

    group = rb_ary_new();
    for (; prevy < iy; prevy++)
      rb_ary_push(group, INT2FIX(prevy));
    rb_ary_push(res2, group);
  }

  free(align);

  ret = rb_ary_new();
  rb_ary_push(ret, res1);
  rb_ary_push(ret, res2);

  return ret;
}
|
135
|
+
|
136
|
+
/********************************************************************/
|
137
|
+
|
138
|
+
/**
|
139
|
+
* @file
|
140
|
+
* @brief Sentence-aligner main program
|
141
|
+
*
|
142
|
+
* Sentence-aligner software heavily based on Pernilla Danielsson and
|
143
|
+
* Daniel Ridings implementation of William Gale and Kenneth Church
|
144
|
+
* sentence aligner algorithm
|
145
|
+
*
|
146
|
+
* The compiled program is used as
|
147
|
+
* <pre>
|
148
|
+
* nat-sentence-aligner -D '.PARA' -d '.End of Sentence' file1 file2
|
149
|
+
* </pre>
|
150
|
+
* where both <i>file1</i> and <i>file2</i> are tokenized and with a
|
151
|
+
* token per line. In the example, '.PARA' is the hard delimiter,
|
152
|
+
* and '.End of Sentence' the soft delimiter. They are considered as
|
153
|
+
* single tokens, and as such, should appear in a line by themselves.
|
154
|
+
*
|
155
|
+
* The program is allowed to delete soft delimiters as necessary in
|
156
|
+
* order to align the files, but it cannot change hard delimiters.
|
157
|
+
*
|
158
|
+
* The output will be written in two files: file1.al and file2.al
|
159
|
+
* where each one contains aligned sentences.
|
160
|
+
*
|
161
|
+
* For debugging it can be useful to output the data in just one file.
|
162
|
+
* In this case, use the '-s' switch and just the 'file1.al' will be
|
163
|
+
* created.
|
164
|
+
*
|
165
|
+
* @todo Check if we want to document all the functions
|
166
|
+
*/
|
167
|
+
|
168
|
+
|
169
|
+
#define dist(x,y) distances[(x) * ((ny) + 1) + (y)]
|
170
|
+
#define pathx(x,y) path_x[(x) * ((ny) + 1) + (y)]
|
171
|
+
#define pathy(x,y) path_y[(x) * ((ny) + 1) + (y)]
|
172
|
+
|
173
|
+
/**
|
174
|
+
* @brief Constant representing a big align distance between two
|
175
|
+
* sentences
|
176
|
+
*/
|
177
|
+
#define BIG_DISTANCE 2500
|
178
|
+
|
179
|
+
/*
|
180
|
+
|
181
|
+
seq_align by Mike Riley
|
182
|
+
Sequence alignment routine.
|
183
|
+
This version allows for contraction/expansions.
|
184
|
+
|
185
|
+
x and y are sequences of objects, represented as non-zero ints, to be aligned.
|
186
|
+
|
187
|
+
dist_funct(x1, y1, x2, y2) is a distance function of 4 args:
|
188
|
+
|
189
|
+
dist_funct(x1, y1, 0, 0) gives cost of substitution of x1 by y1.
|
190
|
+
dist_funct(x1, 0, 0, 0) gives cost of deletion of x1.
|
191
|
+
dist_funct(0, y1, 0, 0) gives cost of insertion of y1.
|
192
|
+
dist_funct(x1, y1, x2, 0) gives cost of contraction of (x1,x2) to y1.
|
193
|
+
dist_funct(x1, y1, 0, y2) gives cost of expansion of x1 to (y1,y2).
|
194
|
+
dist_funct(x1, y1, x2, y2) gives cost to match (x1,x2) to (y1,y2).
|
195
|
+
|
196
|
+
align is the alignment, with (align[i].x1, align[i].x2) aligned
|
197
|
+
with (align[i].y1, align[i].y2). Zero in align[].x1 and align[].y1
|
198
|
+
correspond to insertion and deletion, respectively. Non-zero in
|
199
|
+
align[].x2 and align[].y2 correspond to contraction and expansion,
|
200
|
+
respectively. align[].d gives the distance for that pairing.
|
201
|
+
|
202
|
+
The function returns the length of the alignment.
|
203
|
+
|
204
|
+
*/
|
205
|
+
|
206
|
+
static int seq_align(int *x, int *y, int nx, int ny, int (*dist_funct)(),
|
207
|
+
struct alignment **align)
|
208
|
+
{
|
209
|
+
int *distances, *path_x, *path_y, n;
|
210
|
+
int i, j, oi, oj, di, dj, d1, d2, d3, d4, d5, d6, dmin;
|
211
|
+
struct alignment *ralign;
|
212
|
+
|
213
|
+
distances = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
|
214
|
+
path_x = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
|
215
|
+
path_y = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
|
216
|
+
ralign = (struct alignment *) malloc((nx + ny) * sizeof(struct alignment));
|
217
|
+
|
218
|
+
for (j = 0; j <= ny; j++) {
|
219
|
+
for (i = 0; i <= nx; i++) {
|
220
|
+
d1 = i>0 && j>0 ? /* substitution */
|
221
|
+
dist(i-1, j-1) + (*dist_funct)(x[i-1], y[j-1], 0, 0)
|
222
|
+
: INT_MAX;
|
223
|
+
d2 = i>0 ? /* deletion */
|
224
|
+
dist(i-1, j) + (*dist_funct)(x[i-1], 0, 0, 0)
|
225
|
+
: INT_MAX;
|
226
|
+
d3 = j>0 ? /* insertion */
|
227
|
+
dist(i, j-1) + (*dist_funct)(0, y[j-1], 0, 0)
|
228
|
+
: INT_MAX;
|
229
|
+
d4 = i>1 && j>0 ? /* contraction */
|
230
|
+
dist(i-2, j-1) + (*dist_funct)(x[i-2], y[j-1], x[i-1], 0)
|
231
|
+
: INT_MAX;
|
232
|
+
d5 = i>0 && j>1 ? /* expansion */
|
233
|
+
dist(i-1, j-2) + (*dist_funct)(x[i-1], y[j-2], 0, y[j-1])
|
234
|
+
: INT_MAX;
|
235
|
+
d6 = i>1 && j>1 ? /* melding */
|
236
|
+
dist(i-2, j-2) + (*dist_funct)(x[i-2], y[j-2], x[i-1], y[j-1])
|
237
|
+
: INT_MAX;
|
238
|
+
|
239
|
+
dmin = d1;
|
240
|
+
if (d2<dmin) dmin=d2;
|
241
|
+
if (d3<dmin) dmin=d3;
|
242
|
+
if (d4<dmin) dmin=d4;
|
243
|
+
if (d5<dmin) dmin=d5;
|
244
|
+
if (d6<dmin) dmin=d6;
|
245
|
+
|
246
|
+
if (dmin == INT_MAX) {
|
247
|
+
dist(i,j) = 0;
|
248
|
+
}
|
249
|
+
else if (dmin == d1) {
|
250
|
+
dist(i,j) = d1;
|
251
|
+
pathx(i,j) = i-1;
|
252
|
+
pathy(i,j) = j-1;
|
253
|
+
}
|
254
|
+
else if (dmin == d2) {
|
255
|
+
dist(i,j) = d2;
|
256
|
+
pathx(i,j) = i-1;
|
257
|
+
pathy(i,j) = j;
|
258
|
+
}
|
259
|
+
else if (dmin == d3) {
|
260
|
+
dist(i,j) = d3;
|
261
|
+
pathx(i,j) = i;
|
262
|
+
pathy(i,j) = j-1;
|
263
|
+
}
|
264
|
+
else if (dmin == d4) {
|
265
|
+
dist(i,j) = d4;
|
266
|
+
pathx(i,j) = i-2;
|
267
|
+
pathy(i,j) = j-1;
|
268
|
+
}
|
269
|
+
else if (dmin == d5){
|
270
|
+
dist(i,j) = d5;
|
271
|
+
pathx(i,j) = i-1;
|
272
|
+
pathy(i,j) = j-2;
|
273
|
+
}
|
274
|
+
else /* dmin == d6 */ {
|
275
|
+
dist(i,j) = d6;
|
276
|
+
pathx(i,j) = i-2;
|
277
|
+
pathy(i,j) = j-2;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
}
|
281
|
+
|
282
|
+
n = 0;
|
283
|
+
for (i=nx, j=ny ; i>0 || j>0 ; i = oi, j = oj) {
|
284
|
+
oi = pathx(i, j);
|
285
|
+
oj = pathy(i, j);
|
286
|
+
di = i - oi;
|
287
|
+
dj = j - oj;
|
288
|
+
|
289
|
+
if (di == 1 && dj == 1) { /* substitution */
|
290
|
+
ralign[n].x1 = x[i-1];
|
291
|
+
ralign[n].y1 = y[j-1];
|
292
|
+
ralign[n].x2 = 0;
|
293
|
+
ralign[n].y2 = 0;
|
294
|
+
ralign[n++].d = dist(i, j) - dist(i-1, j-1);
|
295
|
+
}
|
296
|
+
|
297
|
+
else if (di == 1 && dj == 0) { /* deletion */
|
298
|
+
ralign[n].x1 = x[i-1];
|
299
|
+
ralign[n].y1 = 0;
|
300
|
+
ralign[n].x2 = 0;
|
301
|
+
ralign[n].y2 = 0;
|
302
|
+
ralign[n++].d = dist(i, j) - dist(i-1, j);
|
303
|
+
}
|
304
|
+
|
305
|
+
else if (di == 0 && dj == 1) { /* insertion */
|
306
|
+
ralign[n].x1 = 0;
|
307
|
+
ralign[n].y1 = y[j-1];
|
308
|
+
ralign[n].x2 = 0;
|
309
|
+
ralign[n].y2 = 0;
|
310
|
+
ralign[n++].d = dist(i, j) - dist(i, j-1);
|
311
|
+
}
|
312
|
+
|
313
|
+
else if (dj == 1) { /* contraction */
|
314
|
+
ralign[n].x1 = x[i-2];
|
315
|
+
ralign[n].y1 = y[j-1];
|
316
|
+
ralign[n].x2 = x[i-1];
|
317
|
+
ralign[n].y2 = 0;
|
318
|
+
ralign[n++].d = dist(i, j) - dist(i-2, j-1);
|
319
|
+
}
|
320
|
+
|
321
|
+
else if (di == 1) { /* expansion */
|
322
|
+
ralign[n].x1 = x[i-1];
|
323
|
+
ralign[n].y1 = y[j-2];
|
324
|
+
ralign[n].x2 = 0;
|
325
|
+
ralign[n].y2 = y[j-1];
|
326
|
+
ralign[n++].d = dist(i, j) - dist(i-1, j-2);
|
327
|
+
}
|
328
|
+
else /* di == 2 && dj == 2 */ { /* melding */
|
329
|
+
ralign[n].x1 = x[i-2];
|
330
|
+
ralign[n].y1 = y[j-2];
|
331
|
+
ralign[n].x2 = x[i-1];
|
332
|
+
ralign[n].y2 = y[j-1];
|
333
|
+
ralign[n++].d = dist(i, j) - dist(i-2, j-2);
|
334
|
+
}
|
335
|
+
}
|
336
|
+
|
337
|
+
*align = (struct alignment *) malloc(n * sizeof(struct alignment));
|
338
|
+
|
339
|
+
for (i=0; i<n; i++)
|
340
|
+
bcopy(ralign + i, (*align) + (n-i-1), sizeof(struct alignment));
|
341
|
+
|
342
|
+
free(distances);
|
343
|
+
free(path_x);
|
344
|
+
free(path_y);
|
345
|
+
free(ralign);
|
346
|
+
return(n);
|
347
|
+
}
|
348
|
+
|
349
|
+
/* Returns the area under a normal distribution
|
350
|
+
from -inf to z standard deviations */
|
351
|
+
static double pnorm(double z)
|
352
|
+
{
|
353
|
+
double t, pd;
|
354
|
+
t = 1/(1 + 0.2316419 * z);
|
355
|
+
pd = 1 - 0.3989423 *
|
356
|
+
exp(-z * z/2) *
|
357
|
+
((((1.330274429 * t - 1.821255978) * t
|
358
|
+
+ 1.781477937) * t - 0.356563782) * t + 0.319381530) * t;
|
359
|
+
/* see Gradsteyn & Rhyzik, 26.2.17 p932 */
|
360
|
+
return(pd);
|
361
|
+
}
|
362
|
+
|
363
|
+
|
364
|
+
/* Return -100 * log probability that an English sentence of length
|
365
|
+
len1 is a translation of a foreign sentence of length len2. The
|
366
|
+
probability is based on two parameters, the mean and variance of
|
367
|
+
number of foreign characters per English character.
|
368
|
+
*/
|
369
|
+
|
370
|
+
static int match(int len1,int len2)
|
371
|
+
{
|
372
|
+
double z, pd, mean;
|
373
|
+
|
374
|
+
/* foreign characters per english character */
|
375
|
+
double foreign_chars_per_eng_char = 1;
|
376
|
+
|
377
|
+
/* variance per english character */
|
378
|
+
double var_per_eng_char = 6.8 ;
|
379
|
+
|
380
|
+
if (len1==0 && len2==0) return(0);
|
381
|
+
mean = (len1 + len2/foreign_chars_per_eng_char)/2;
|
382
|
+
z = (foreign_chars_per_eng_char * len1 - len2)/sqrt(var_per_eng_char * mean);
|
383
|
+
|
384
|
+
/* Need to deal with both sides of the normal distribution */
|
385
|
+
if (z < 0) z = -z;
|
386
|
+
pd = 2 * (1 - pnorm(z));
|
387
|
+
|
388
|
+
if (pd > 0) return((int)(-100 * log(pd)));
|
389
|
+
else return(BIG_DISTANCE);
|
390
|
+
}
|
391
|
+
|
392
|
+
/* Distance function passed to seq_align. The kind of step is derived from
 * which of x2 / y2 / x1 / y1 are zero, and a fixed penalty for that kind
 * is added to the length-based match() cost.
 */
static int two_side_distance(int x1, int y1, int x2, int y2)
{
  const int penalty21 = 230; /* -100 * log([prob of 2-1 match] / [prob of 1-1 match]) */
  const int penalty22 = 440; /* -100 * log([prob of 2-2 match] / [prob of 1-1 match]) */
  const int penalty01 = 450; /* -100 * log([prob of 0-1 match] / [prob of 1-1 match]) */

  if (x2 != 0 && y2 != 0) /* melding */
    return match(x1 + x2, y1 + y2) + penalty22;

  if (x2 != 0) /* contraction */
    return match(x1 + x2, y1) + penalty21;

  if (y2 != 0) /* expansion */
    return match(x1, y1 + y2) + penalty21;

  /* Only single elements remain: insertion, deletion or substitution. */
  if (x1 == 0 || y1 == 0) /* insertion or deletion */
    return match(x1, y1) + penalty01;

  return match(x1, y1); /* substitution */
}
|
data/lib/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.so
|
data/lib/alignment.rb
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# alignment.rb - Alignment functions
|
4
|
+
#
|
5
|
+
# Written by Marius L. Jøhndal <mariuslj at ifi.uio.no>, 2008
|
6
|
+
#
|
7
|
+
require 'gale_church'
|
8
|
+
|
9
|
+
module Alignment
|
10
|
+
# Returns true if +o+ can take part in an alignment, i.e. if it responds
# to +weight+.
def self.is_alignable?(o)
  # The predicate is respond_to?; "responds_to?" does not exist and
  # raised NoMethodError for every argument.
  o.respond_to?(:weight)
end
|
13
|
+
|
14
|
+
# Represents an alignment of two regions, one +left+ region and one
# +right+ region. Both regions are arrays of alignable objects.
AlignedRegions = Struct.new(:left, :right)

class AlignedRegions
  # Renders the pair as "<left,right>", where each side is the
  # concatenation of the string forms of its objects, e.g. "<foo,barbaz>".
  #
  # The regions are joined explicitly: interpolating an Array only
  # concatenates its elements on Ruby 1.8, while later versions
  # interpolate the inspect form instead.
  def to_s
    "<#{Array(left).join},#{Array(right).join}>"
  end
end
|
23
|
+
|
24
|
+
public
|
25
|
+
|
26
|
+
# Aligns two arrays of regions +a+ and +b+ and returns an array of
# +AlignedRegions+. Every region must respond to +weight+; the weights
# are what the aligner works on. +method+ selects the algorithm; only
# +:gale_church+ is available and anything else raises ArgumentError.
def self.align_regions(a, b, method = :gale_church)
  raise ArgumentError.new("invalid method #{method}") unless method == :gale_church

  weights_a = a.map { |region| region.weight }
  weights_b = b.map { |region| region.weight }

  # The aligner answers with two parallel lists of index groups.
  groups_a, groups_b = Alignment::GaleChurch::align(weights_a, weights_b)

  unless groups_a.length == groups_b.length
    raise "Error aligning regions: returned block count does not match"
  end

  groups_a.zip(groups_b).map do |indices_a, indices_b|
    AlignedRegions.new(indices_a.map { |i| a[i] }, indices_b.map { |i| b[i] })
  end
end
|
39
|
+
|
40
|
+
# A piece of text that can be aligned: wraps a string and exposes the
# +weight+ that the aligner works on.
class AlignableText
  attr_reader :text

  def initialize(text)
    @text = text
  end

  # Weight of the text: the number of space-separated words.
  #
  # FIXME (carried over from the original): it is undecided whether the
  # weight should be the character length or the word count. The original
  # evaluated the character length and threw it away, so the word count
  # has always been the value returned.
  def weight
    @text.split(' ').length
  end

  def to_s
    @text
  end
end
|
58
|
+
|
59
|
+
ANCHOR_REGEXP = /\s*\|\|\s*/
BOUNDARY_REGEXP = /\s*\|\s*/

# Aligns two strings of text +a+ and +b+ using method +method+. Block
# boundaries are written as a | character (with or without surrounding
# white-space). The sequence || is an anchor (or hard delimiter), i.e. a
# fixed synchronisation point: both strings must contain the same number
# of anchors, while the number of boundaries may differ. Returns an array
# of pairs of aligned strings; raises ArgumentError when the anchor counts
# differ.
def self.align_text(a, b, method = :gale_church)
  sections_a = a.split(ANCHOR_REGEXP)
  sections_b = b.split(ANCHOR_REGEXP)

  unless sections_a.length == sections_b.length
    raise ArgumentError.new("different number of anchors in strings")
  end

  pairs = []

  # Each anchored section is aligned on its own, in order.
  sections_a.each_with_index do |section_a, index|
    blocks_a = section_a.split(BOUNDARY_REGEXP).map { |block| AlignableText.new(block) }
    blocks_b = sections_b[index].split(BOUNDARY_REGEXP).map { |block| AlignableText.new(block) }

    align_regions(blocks_a, blocks_b, method).each do |aligned|
      left_text = aligned.left.map { |block| block.text }.join(' ')
      right_text = aligned.right.map { |block| block.text }.join(' ')
      pairs << [left_text, right_text]
    end
  end

  pairs
end
|
86
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'alignment'
|
2
|
+
require 'test/unit'
|
3
|
+
|
4
|
+
# Tests for region-level and text-level Gale-Church alignment.
class AlignmentTestCase < Test::Unit::TestCase
  def test_church_alignment
    assert_equal "<foo,bar>", aligned(["foo"], ["bar"])
    assert_equal "<foo,barbaz>", aligned(["foo"], ["bar", "baz"])
    assert_equal "<,bar>", aligned([], ["bar"])
    assert_equal "", aligned([], [])
    assert_equal "<foo,bar>,<koo,baz>", aligned(["foo", "koo"], ["bar", "baz"])
  end

  def test_church_text_alignment
    # Same number of blocks on both sides.
    expected = [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper"], ["over the lazy dog", "over den trege hunden"]]
    actual = Alignment::align_text("The quick brown fox | jumps | over the lazy dog",
                                   "Den kvikke brune reven | hopper | over den trege hunden")
    assert_equal expected, actual

    # A longer middle block on the right-hand side.
    expected = [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper elegant"], ["over the lazy dog", "over den trege hunden"]]
    actual = Alignment::align_text("The quick brown fox | jumps | over the lazy dog",
                                   "Den kvikke brune reven | hopper elegant | over den trege hunden")
    assert_equal expected, actual

    # An anchor (||) fixes the synchronisation point.
    expected = [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper"], ["over the lazy dog", "elegant over den trege hunden"]]
    actual = Alignment::align_text("The quick brown fox | jumps || over the lazy dog",
                                   "Den kvikke brune reven | hopper || elegant | over den trege hunden")
    assert_equal expected, actual
  end

  private

  # Aligns the given left and right strings as regions and renders the
  # result as a comma-separated string.
  def aligned(left, right)
    x = left.collect { |s| Alignment::AlignableText.new(s) }
    y = right.collect { |s| Alignment::AlignableText.new(s) }
    Alignment::align_regions(x, y).collect(&:to_s).join(",")
  end
end
|
41
|
+
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: alignment
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- "Marius L. J\xC3\xB8hndal"
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-12-11 00:00:00 Z
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Alignment functions for corpus linguistics.
|
22
|
+
email: mariuslj (at) ifi [dot] uio (dot) no
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions:
|
26
|
+
- ext/gale_church/extconf.rb
|
27
|
+
extra_rdoc_files:
|
28
|
+
- README.rdoc
|
29
|
+
files:
|
30
|
+
- CHANGELOG
|
31
|
+
- Manifest
|
32
|
+
- README.rdoc
|
33
|
+
- Rakefile
|
34
|
+
- VERSION
|
35
|
+
- alignment.gemspec
|
36
|
+
- ext/gale_church/.gitignore
|
37
|
+
- ext/gale_church/extconf.rb
|
38
|
+
- ext/gale_church/gale_church.c
|
39
|
+
- lib/.gitignore
|
40
|
+
- lib/alignment.rb
|
41
|
+
- test/test_alignment.rb
|
42
|
+
homepage: http://github.com/mlj/alignment
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
hash: 3
|
56
|
+
segments:
|
57
|
+
- 0
|
58
|
+
version: "0"
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
hash: 3
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
version: "0"
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: alignment
|
71
|
+
rubygems_version: 1.8.11
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Alignment functions for corpus linguistics
|
75
|
+
test_files: []
|
76
|
+
|