alignment 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG ADDED
@@ -0,0 +1 @@
1
+ v0.1.0. initial release
data/Manifest ADDED
@@ -0,0 +1,8 @@
1
+ README.rdoc
2
+ Rakefile
3
+ test/test_alignment.rb
4
+ CHANGELOG
5
+ ext/gale_church/gale_church.c
6
+ ext/gale_church/extconf.rb
7
+ lib/alignment.rb
8
+ Manifest
data/README.rdoc ADDED
@@ -0,0 +1,26 @@
1
+ = alignment
2
+
3
+ Alignment functions for corpus linguistics. Currently implemented
4
+ is the Gale-Church alignment algorithm.
5
+
6
+ == Installation
7
+
8
+ gem install alignment
9
+
10
+ == License
11
+
12
+ GPL2
13
+
14
+ The Gale-Church implementation was derived from NATools.
15
+ Implementation is copyright Pernilla Danielsson, Daniel Ridings,
16
+ and Alberto Simões. Algorithm is copyright William Gale and
17
+ Kenneth Church.
18
+
19
+ Ruby version is copyright Marius L. Jøhndal.
20
+
21
+ == Usage
22
+
23
+
24
+ == Development
25
+
26
+ The project is hosted on GitHub at http://github.com/mlj/alignment.
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
# -*- encoding: utf-8 -*-
#
# Packaging tasks for the gem. They are defined through Jeweler when it is
# installed; otherwise a hint is printed and no tasks are defined.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    # Identity
    gem.name = "alignment"
    gem.rubyforge_project = "alignment"
    gem.homepage = "http://github.com/mlj/alignment"

    # Descriptive text
    gem.summary = "Alignment functions for corpus linguistics"
    gem.description = "Alignment functions for corpus linguistics."

    # Maintainer
    gem.authors = ['Marius L. Jøhndal']
    gem.email = "mariuslj (at) ifi [dot] uio (dot) no"
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.0
data/alignment.gemspec ADDED
@@ -0,0 +1,47 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "alignment"
8
+ s.version = "0.2.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Marius L. J\303\270hndal"]
12
+ s.date = "2011-12-03"
13
+ s.description = "Alignment functions for corpus linguistics."
14
+ s.email = "mariuslj (at) ifi [dot] uio (dot) no"
15
+ s.extensions = ["ext/gale_church/extconf.rb"]
16
+ s.extra_rdoc_files = [
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ "CHANGELOG",
21
+ "Manifest",
22
+ "README.rdoc",
23
+ "Rakefile",
24
+ "VERSION",
25
+ "ext/gale_church/.gitignore",
26
+ "ext/gale_church/extconf.rb",
27
+ "ext/gale_church/gale_church.c",
28
+ "lib/.gitignore",
29
+ "lib/alignment.rb",
30
+ "test/test_alignment.rb"
31
+ ]
32
+ s.homepage = "http://github.com/mlj/alignment"
33
+ s.require_paths = ["lib"]
34
+ s.rubyforge_project = "alignment"
35
+ s.rubygems_version = "1.8.11"
36
+ s.summary = "Alignment functions for corpus linguistics"
37
+
38
+ if s.respond_to? :specification_version then
39
+ s.specification_version = 3
40
+
41
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
42
+ else
43
+ end
44
+ else
45
+ end
46
+ end
47
+
@@ -0,0 +1,3 @@
1
+ Makefile
2
+ *.o
3
+ *.so
@@ -0,0 +1,5 @@
1
# Generates the Makefile used to build the gale_church C extension.
require 'mkmf'

extension_name = 'gale_church'
dir_config(extension_name)

# Append -Wall instead of assigning it: assigning would discard whatever
# compiler flags mkmf had already placed in $CFLAGS.
$CFLAGS = "#{$CFLAGS} -Wall"

create_makefile(extension_name)
@@ -0,0 +1,411 @@
1
+ /* NATools - Package with parallel corpora tools
2
+ * Original Implementation Copyright:
3
+ * Pernilla Danielsson and Daniel Ridings
4
+ * Algorithm Copyright:
5
+ * William Gale and Kenneth Church
6
+ * Copyright (C) 2002-2004 Alberto Simões
7
+
8
+ * Ruby adaptation:
9
+ * Copyright (C) 2008 Marius L. Jøhndal
10
+ *
11
+ * This package is free software; you can redistribute it and/or
12
+ * modify it under the terms of the GNU Lesser General Public
13
+ * License as published by the Free Software Foundation; either
14
+ * version 2 of the License, or (at your option) any later version.
15
+ *
16
+ * This library is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19
+ * Lesser General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU Lesser General Public
22
+ * License along with this library; if not, write to the
23
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24
+ * Boston, MA 02111-1307, USA.
25
+ */
26
+
27
+ #include "ruby.h"
28
+ #include <assert.h>
29
+ #include <math.h>
30
+ #include <stdio.h>
31
+ #include <fcntl.h>
32
+ #include <stdlib.h>
33
+ #include <unistd.h>
34
+ #include <string.h>
35
+
36
/* Ruby 1.8 compatibility: supply RARRAY_LEN when ruby.h does not.
 * The guard tests the macro that is actually being defined, and the
 * expansion is parenthesised so it composes safely inside expressions. */
#ifndef RARRAY_LEN
#define RARRAY_LEN(ptr) (RARRAY(ptr)->len)
#endif
39
+
40
+ VALUE mLogos = Qnil;
41
+ VALUE mGaleChurch = Qnil;
42
+
43
+ void Init_gale_church();
44
+
45
+ VALUE method_align(VALUE self, VALUE len1, VALUE len2);
46
+
47
+ struct alignment {
48
+ int x1;
49
+ int y1;
50
+ int x2;
51
+ int y2;
52
+ int d;
53
+ };
54
+
55
+ static int two_side_distance(int x1, int y1, int x2, int y2);
56
+ static int seq_align(int *x, int *y, int nx, int ny, int (*dist_funct)(),
57
+ struct alignment **align);
58
+
59
+ void Init_gale_church()
60
+ {
61
+ mLogos = rb_define_module("Alignment");
62
+ mGaleChurch = rb_define_module_under(mLogos, "GaleChurch");
63
+ rb_define_module_function(mGaleChurch, "align", method_align, 2);
64
+ }
65
+
66
+ VALUE method_align(VALUE self, VALUE len1, VALUE len2)
67
+ {
68
+ VALUE ret, res1, res2, grouped_regions1, grouped_regions2;
69
+
70
+ int n, i, ix, iy, prevx, prevy;
71
+ struct alignment *align, *a;
72
+ int *len1_x;
73
+ int *len2_x;
74
+
75
+ res1 = rb_ary_new();
76
+ res2 = rb_ary_new();
77
+
78
+ len1_x = (int *)malloc(RARRAY_LEN(len1) * sizeof(int));
79
+ len2_x = (int *)malloc(RARRAY_LEN(len2) * sizeof(int));
80
+
81
+ for (i = 0; i < RARRAY_LEN(len1); i++)
82
+ len1_x[i] = NUM2INT(rb_ary_entry(len1, i));
83
+
84
+ for (i = 0; i < RARRAY_LEN(len2); i++)
85
+ len2_x[i] = NUM2INT(rb_ary_entry(len2, i));
86
+
87
+ n = seq_align(len1_x, len2_x, RARRAY_LEN(len1), RARRAY_LEN(len2),
88
+ two_side_distance, &align);
89
+
90
+ free(len1_x);
91
+ free(len2_x);
92
+
93
+ prevx = prevy = ix = iy = 0;
94
+
95
+ for (i = 0; i < n; i++) {
96
+ a = &align[i];
97
+
98
+ if (a->x2 > 0)
99
+ ix++;
100
+ else if (a->x1 == 0)
101
+ ix--;
102
+
103
+ if (a->y2 > 0)
104
+ iy++;
105
+ else if (a->y1 == 0)
106
+ iy--;
107
+
108
+ if (a->x1 == 0 && a->y1 == 0 && a->x2 == 0 && a->y2 == 0) {
109
+ ix++;
110
+ iy++;
111
+ }
112
+
113
+ ix++;
114
+ iy++;
115
+
116
+ grouped_regions1 = rb_ary_new();
117
+ for (; prevx < ix; prevx++)
118
+ rb_ary_push(grouped_regions1, INT2FIX(prevx));
119
+ rb_ary_push(res1, grouped_regions1);
120
+
121
+ grouped_regions2 = rb_ary_new();
122
+ for (; prevy < iy; prevy++)
123
+ rb_ary_push(grouped_regions2, INT2FIX(prevy));
124
+ rb_ary_push(res2, grouped_regions2);
125
+ }
126
+
127
+ free(align);
128
+
129
+ ret = rb_ary_new();
130
+ rb_ary_push(ret, res1);
131
+ rb_ary_push(ret, res2);
132
+
133
+ return ret;
134
+ }
135
+
136
+ /********************************************************************/
137
+
138
+ /**
139
+ * @file
140
+ * @brief Sentence-aligner main program
141
+ *
142
+ * Sentence-aligner software heavily based on Pernilla Danielsson and
143
+ * Daniel Ridings implementation of William Gale and Kenneth Church
144
+ * sentence aligner algorithm
145
+ *
146
+ * The compiled program is used as
147
+ * <pre>
148
+ * nat-sentence-aligner -D '.PARA' -d '.End of Sentence' file1 file2
149
+ * </pre>
150
+ * where both <i>file1</i> and <i>file2</i> are tokenized and with a
151
+ * token per line. In the example, '.PARA' is the hard delimiter,
152
+ * and '.End of Sentence' the soft delimiter. They are considered as
153
+ * single tokens, and as such, should appear in a line by themselves.
154
+ *
155
+ * The program is allowed to delete soft delimiters as necessary in
156
+ * order to align the files, but it cannot change hard delimiters.
157
+ *
158
+ * The output will be written in two files: file1.al and file2.al
159
+ * where each one contains aligned sentences.
160
+ *
161
+ * For debugging it can be useful to output the data in just one file.
162
+ * In this case, use the '-s' switch and just the 'file1.al' will be
163
+ * created.
164
+ *
165
+ * @todo Check if we want to document all the functions
166
+ */
167
+
168
+
169
+ #define dist(x,y) distances[(x) * ((ny) + 1) + (y)]
170
+ #define pathx(x,y) path_x[(x) * ((ny) + 1) + (y)]
171
+ #define pathy(x,y) path_y[(x) * ((ny) + 1) + (y)]
172
+
173
+ /**
174
+ * @brief Constant representing a big align distance between two
175
+ * sentences
176
+ */
177
+ #define BIG_DISTANCE 2500
178
+
179
+ /*
180
+
181
+ seq_align by Mike Riley
182
+ Sequence alignment routine.
183
+ This version allows for contraction/expansions.
184
+
185
+ x and y are sequences of objects, represented as non-zero ints, to be aligned.
186
+
187
+ dist_funct(x1, y1, x2, y2) is a distance function of 4 args:
188
+
189
+ dist_funct(x1, y1, 0, 0) gives cost of substitution of x1 by y1.
190
+ dist_funct(x1, 0, 0, 0) gives cost of deletion of x1.
191
+ dist_funct(0, y1, 0, 0) gives cost of insertion of y1.
192
+ dist_funct(x1, y1, x2, 0) gives cost of contraction of (x1,x2) to y1.
193
+ dist_funct(x1, y1, 0, y2) gives cost of expansion of x1 to (y1,y2).
194
+ dist_funct(x1, y1, x2, y2) gives cost to match (x1,x2) to (y1,y2).
195
+
196
+ align is the alignment, with (align[i].x1, align[i].x2) aligned
197
+ with (align[i].y1, align[i].y2). Zero in align[].x1 and align[].y1
198
+ correspond to insertion and deletion, respectively. Non-zero in
199
+ align[].x2 and align[].y2 correspond to contraction and expansion,
200
+ respectively. align[].d gives the distance for that pairing.
201
+
202
+ The function returns the length of the alignment.
203
+
204
+ */
205
+
206
+ static int seq_align(int *x, int *y, int nx, int ny, int (*dist_funct)(),
207
+ struct alignment **align)
208
+ {
209
+ int *distances, *path_x, *path_y, n;
210
+ int i, j, oi, oj, di, dj, d1, d2, d3, d4, d5, d6, dmin;
211
+ struct alignment *ralign;
212
+
213
+ distances = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
214
+ path_x = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
215
+ path_y = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
216
+ ralign = (struct alignment *) malloc((nx + ny) * sizeof(struct alignment));
217
+
218
+ for (j = 0; j <= ny; j++) {
219
+ for (i = 0; i <= nx; i++) {
220
+ d1 = i>0 && j>0 ? /* substitution */
221
+ dist(i-1, j-1) + (*dist_funct)(x[i-1], y[j-1], 0, 0)
222
+ : INT_MAX;
223
+ d2 = i>0 ? /* deletion */
224
+ dist(i-1, j) + (*dist_funct)(x[i-1], 0, 0, 0)
225
+ : INT_MAX;
226
+ d3 = j>0 ? /* insertion */
227
+ dist(i, j-1) + (*dist_funct)(0, y[j-1], 0, 0)
228
+ : INT_MAX;
229
+ d4 = i>1 && j>0 ? /* contraction */
230
+ dist(i-2, j-1) + (*dist_funct)(x[i-2], y[j-1], x[i-1], 0)
231
+ : INT_MAX;
232
+ d5 = i>0 && j>1 ? /* expansion */
233
+ dist(i-1, j-2) + (*dist_funct)(x[i-1], y[j-2], 0, y[j-1])
234
+ : INT_MAX;
235
+ d6 = i>1 && j>1 ? /* melding */
236
+ dist(i-2, j-2) + (*dist_funct)(x[i-2], y[j-2], x[i-1], y[j-1])
237
+ : INT_MAX;
238
+
239
+ dmin = d1;
240
+ if (d2<dmin) dmin=d2;
241
+ if (d3<dmin) dmin=d3;
242
+ if (d4<dmin) dmin=d4;
243
+ if (d5<dmin) dmin=d5;
244
+ if (d6<dmin) dmin=d6;
245
+
246
+ if (dmin == INT_MAX) {
247
+ dist(i,j) = 0;
248
+ }
249
+ else if (dmin == d1) {
250
+ dist(i,j) = d1;
251
+ pathx(i,j) = i-1;
252
+ pathy(i,j) = j-1;
253
+ }
254
+ else if (dmin == d2) {
255
+ dist(i,j) = d2;
256
+ pathx(i,j) = i-1;
257
+ pathy(i,j) = j;
258
+ }
259
+ else if (dmin == d3) {
260
+ dist(i,j) = d3;
261
+ pathx(i,j) = i;
262
+ pathy(i,j) = j-1;
263
+ }
264
+ else if (dmin == d4) {
265
+ dist(i,j) = d4;
266
+ pathx(i,j) = i-2;
267
+ pathy(i,j) = j-1;
268
+ }
269
+ else if (dmin == d5){
270
+ dist(i,j) = d5;
271
+ pathx(i,j) = i-1;
272
+ pathy(i,j) = j-2;
273
+ }
274
+ else /* dmin == d6 */ {
275
+ dist(i,j) = d6;
276
+ pathx(i,j) = i-2;
277
+ pathy(i,j) = j-2;
278
+ }
279
+ }
280
+ }
281
+
282
+ n = 0;
283
+ for (i=nx, j=ny ; i>0 || j>0 ; i = oi, j = oj) {
284
+ oi = pathx(i, j);
285
+ oj = pathy(i, j);
286
+ di = i - oi;
287
+ dj = j - oj;
288
+
289
+ if (di == 1 && dj == 1) { /* substitution */
290
+ ralign[n].x1 = x[i-1];
291
+ ralign[n].y1 = y[j-1];
292
+ ralign[n].x2 = 0;
293
+ ralign[n].y2 = 0;
294
+ ralign[n++].d = dist(i, j) - dist(i-1, j-1);
295
+ }
296
+
297
+ else if (di == 1 && dj == 0) { /* deletion */
298
+ ralign[n].x1 = x[i-1];
299
+ ralign[n].y1 = 0;
300
+ ralign[n].x2 = 0;
301
+ ralign[n].y2 = 0;
302
+ ralign[n++].d = dist(i, j) - dist(i-1, j);
303
+ }
304
+
305
+ else if (di == 0 && dj == 1) { /* insertion */
306
+ ralign[n].x1 = 0;
307
+ ralign[n].y1 = y[j-1];
308
+ ralign[n].x2 = 0;
309
+ ralign[n].y2 = 0;
310
+ ralign[n++].d = dist(i, j) - dist(i, j-1);
311
+ }
312
+
313
+ else if (dj == 1) { /* contraction */
314
+ ralign[n].x1 = x[i-2];
315
+ ralign[n].y1 = y[j-1];
316
+ ralign[n].x2 = x[i-1];
317
+ ralign[n].y2 = 0;
318
+ ralign[n++].d = dist(i, j) - dist(i-2, j-1);
319
+ }
320
+
321
+ else if (di == 1) { /* expansion */
322
+ ralign[n].x1 = x[i-1];
323
+ ralign[n].y1 = y[j-2];
324
+ ralign[n].x2 = 0;
325
+ ralign[n].y2 = y[j-1];
326
+ ralign[n++].d = dist(i, j) - dist(i-1, j-2);
327
+ }
328
+ else /* di == 2 && dj == 2 */ { /* melding */
329
+ ralign[n].x1 = x[i-2];
330
+ ralign[n].y1 = y[j-2];
331
+ ralign[n].x2 = x[i-1];
332
+ ralign[n].y2 = y[j-1];
333
+ ralign[n++].d = dist(i, j) - dist(i-2, j-2);
334
+ }
335
+ }
336
+
337
+ *align = (struct alignment *) malloc(n * sizeof(struct alignment));
338
+
339
+ for (i=0; i<n; i++)
340
+ bcopy(ralign + i, (*align) + (n-i-1), sizeof(struct alignment));
341
+
342
+ free(distances);
343
+ free(path_x);
344
+ free(path_y);
345
+ free(ralign);
346
+ return(n);
347
+ }
348
+
349
+ /* Returns the area under a normal distribution
350
+ from -inf to z standard deviations */
351
+ static double pnorm(double z)
352
+ {
353
+ double t, pd;
354
+ t = 1/(1 + 0.2316419 * z);
355
+ pd = 1 - 0.3989423 *
356
+ exp(-z * z/2) *
357
+ ((((1.330274429 * t - 1.821255978) * t
358
+ + 1.781477937) * t - 0.356563782) * t + 0.319381530) * t;
359
+ /* see Gradsteyn & Rhyzik, 26.2.17 p932 */
360
+ return(pd);
361
+ }
362
+
363
+
364
+ /* Return -100 * log probability that an English sentence of length
365
+ len1 is a translation of a foreign sentence of length len2. The
366
+ probability is based on two parameters, the mean and variance of
367
+ number of foreign characters per English character.
368
+ */
369
+
370
+ static int match(int len1,int len2)
371
+ {
372
+ double z, pd, mean;
373
+
374
+ /* foreign characters per english character */
375
+ double foreign_chars_per_eng_char = 1;
376
+
377
+ /* variance per english character */
378
+ double var_per_eng_char = 6.8 ;
379
+
380
+ if (len1==0 && len2==0) return(0);
381
+ mean = (len1 + len2/foreign_chars_per_eng_char)/2;
382
+ z = (foreign_chars_per_eng_char * len1 - len2)/sqrt(var_per_eng_char * mean);
383
+
384
+ /* Need to deal with both sides of the normal distribution */
385
+ if (z < 0) z = -z;
386
+ pd = 2 * (1 - pnorm(z));
387
+
388
+ if (pd > 0) return((int)(-100 * log(pd)));
389
+ else return(BIG_DISTANCE);
390
+ }
391
+
392
/* Distance function handed to seq_align. A zero argument means "no
   element in that slot", so the pattern of zeros selects the kind of
   move; each kind adds a fixed penalty to the length-based match cost. */
static int two_side_distance(int x1, int y1, int x2, int y2)
{
  /* Each penalty is -100 * log([prob of that match] / [prob of 1-1 match]). */
  enum {
    PENALTY_21 = 230,   /* 2-1 match (also used for 1-2) */
    PENALTY_22 = 440,   /* 2-2 match */
    PENALTY_01 = 450    /* 0-1 match (also used for 1-0) */
  };

  if (x2 == 0 && y2 == 0) {
    if (x1 == 0 || y1 == 0)           /* insertion or deletion */
      return match(x1, y1) + PENALTY_01;
    return match(x1, y1);             /* substitution */
  }

  if (x2 == 0)                        /* expansion */
    return match(x1, y1 + y2) + PENALTY_21;

  if (y2 == 0)                        /* contraction */
    return match(x1 + x2, y1) + PENALTY_21;

  return match(x1 + x2, y1 + y2) + PENALTY_22;   /* melding */
}
data/lib/.gitignore ADDED
@@ -0,0 +1 @@
1
+ *.so
data/lib/alignment.rb ADDED
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # alignment.rb - Alignment functions
4
+ #
5
+ # Written by Marius L. Jøhndal <mariuslj at ifi.uio.no>, 2008
6
+ #
7
+ require 'gale_church'
8
+
9
module Alignment
  # Returns true if +o+ can take part in an alignment, i.e. if it responds
  # to +weight+.
  def self.is_alignable?(o)
    o.respond_to?(:weight)
  end

  # Represents an alignment of two regions, one +left+ region and one
  # +right+ region. Both regions are arrays of alignable objects.
  AlignedRegions = Struct.new(:left, :right)

  class AlignedRegions
    # Renders the pair as "<left,right>", where each side is the
    # concatenation of the string forms of its objects. The sides are
    # joined explicitly because interpolating an Array directly yields its
    # +inspect+ form on Ruby 1.9 and later.
    def to_s
      "<#{Array(left).join},#{Array(right).join}>"
    end
  end

  # Aligns two arrays of regions +a+ and +b+. Returns an array of
  # +AlignedRegions+.
  #
  # Every region must respond to +weight+. +method+ selects the alignment
  # algorithm; only +:gale_church+ is implemented, and any other value
  # raises ArgumentError.
  def self.align_regions(a, b, method = :gale_church)
    case method
    when :gale_church
      # The aligner works on weights only and answers with two parallel
      # arrays of index groups, one for each side.
      x, y = Alignment::GaleChurch.align(a.map(&:weight), b.map(&:weight))
      raise "Error aligning regions: returned block count does not match" unless x.length == y.length

      x.zip(y).map do |left_indices, right_indices|
        AlignedRegions.new(left_indices.map { |i| a[i] },
                           right_indices.map { |i| b[i] })
      end
    else
      raise ArgumentError, "invalid method #{method}"
    end
  end

  # Wraps a string so that it can be used as an alignable region.
  class AlignableText
    attr_reader :text

    def initialize(text)
      @text = text
    end

    # The weight of the text, currently its number of
    # whitespace-separated words.
    #
    # FIXME: decide between word count and character length; the original
    # code computed both and silently discarded the character length.
    def weight
      @text.split(' ').length
    end

    def to_s
      @text
    end
  end

  ANCHOR_REGEXP = /\s*\|\|\s*/
  BOUNDARY_REGEXP = /\s*\|\s*/

  # Aligns two strings of text +a+ and +b+ using method +method+. The
  # boundaries of the blocks to be aligned are denoted by a | character
  # (with or without surrounding white-space). The sequence || denotes an
  # anchor (or hard delimiter), i.e. a fixed synchronisation point. There
  # must be the same number of anchors in both strings, but the number of
  # boundaries may be different. The returned value is an array of pairs of
  # aligned strings.
  #
  # Raises ArgumentError if the number of anchor-delimited regions differs.
  def self.align_text(a, b, method = :gale_church)
    regions_a = a.split(ANCHOR_REGEXP)
    regions_b = b.split(ANCHOR_REGEXP)

    raise ArgumentError, "different number of anchors in strings" unless regions_a.length == regions_b.length

    result = []

    # Anchors are hard synchronisation points, so each anchor-delimited
    # pair of regions is aligned on its own.
    regions_a.zip(regions_b).each do |x, y|
      left = x.split(BOUNDARY_REGEXP).map { |region| AlignableText.new(region) }
      right = y.split(BOUNDARY_REGEXP).map { |region| AlignableText.new(region) }

      align_regions(left, right, method).each do |pair|
        result << [pair.left.map(&:text).join(' '), pair.right.map(&:text).join(' ')]
      end
    end

    result
  end
end
@@ -0,0 +1,41 @@
1
+ require 'alignment'
2
+ require 'test/unit'
3
+
4
# Exercises region-level and text-level Gale-Church alignment.
class AlignmentTestCase < Test::Unit::TestCase
  # Region alignment for 1-1, 1-2, 0-1, 0-0 and 2-2 inputs.
  def test_church_alignment
    assert_equal "<foo,bar>", aligned(%w[foo], %w[bar])
    assert_equal "<foo,barbaz>", aligned(%w[foo], %w[bar baz])
    assert_equal "<,bar>", aligned([], %w[bar])
    assert_equal "", aligned([], [])
    assert_equal "<foo,bar>,<koo,baz>", aligned(%w[foo koo], %w[bar baz])
  end

  # Text alignment with soft boundaries (|) and an anchor (||).
  def test_church_text_alignment
    english = "The quick brown fox | jumps | over the lazy dog"

    expected = [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper"], ["over the lazy dog", "over den trege hunden"]]
    actual = Alignment::align_text(english, "Den kvikke brune reven | hopper | over den trege hunden")
    assert_equal expected, actual

    expected = [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper elegant"], ["over the lazy dog", "over den trege hunden"]]
    actual = Alignment::align_text(english, "Den kvikke brune reven | hopper elegant | over den trege hunden")
    assert_equal expected, actual

    expected = [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper"], ["over the lazy dog", "elegant over den trege hunden"]]
    actual = Alignment::align_text("The quick brown fox | jumps || over the lazy dog",
                                   "Den kvikke brune reven | hopper || elegant | over den trege hunden")
    assert_equal expected, actual
  end

  private

  # Wraps each string on both sides in an AlignableText, aligns the two
  # sides and returns the comma-joined string forms of the aligned blocks.
  def aligned(left, right)
    x = left.collect { |s| Alignment::AlignableText.new(s) }
    y = right.collect { |s| Alignment::AlignableText.new(s) }
    Alignment::align_regions(x, y).collect(&:to_s).join(",")
  end
end
41
+
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: alignment
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
11
+ platform: ruby
12
+ authors:
13
+ - "Marius L. J\xC3\xB8hndal"
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-12-11 00:00:00 Z
19
+ dependencies: []
20
+
21
+ description: Alignment functions for corpus linguistics.
22
+ email: mariuslj (at) ifi [dot] uio (dot) no
23
+ executables: []
24
+
25
+ extensions:
26
+ - ext/gale_church/extconf.rb
27
+ extra_rdoc_files:
28
+ - README.rdoc
29
+ files:
30
+ - CHANGELOG
31
+ - Manifest
32
+ - README.rdoc
33
+ - Rakefile
34
+ - VERSION
35
+ - alignment.gemspec
36
+ - ext/gale_church/.gitignore
37
+ - ext/gale_church/extconf.rb
38
+ - ext/gale_church/gale_church.c
39
+ - lib/.gitignore
40
+ - lib/alignment.rb
41
+ - test/test_alignment.rb
42
+ homepage: http://github.com/mlj/alignment
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ hash: 3
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 3
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project: alignment
71
+ rubygems_version: 1.8.11
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Alignment functions for corpus linguistics
75
+ test_files: []
76
+