alignment 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG ADDED
@@ -0,0 +1 @@
1
+ v0.1.0. initial release
data/Manifest ADDED
@@ -0,0 +1,8 @@
1
+ README.rdoc
2
+ Rakefile
3
+ test/test_alignment.rb
4
+ CHANGELOG
5
+ ext/gale_church/gale_church.c
6
+ ext/gale_church/extconf.rb
7
+ lib/alignment.rb
8
+ Manifest
data/README.rdoc ADDED
@@ -0,0 +1,26 @@
1
+ = alignment
2
+
3
+ Alignment functions for corpus linguistics. Currently implemented
4
+ is the Gale-Church alignment algorithm.
5
+
6
+ == Installation
7
+
8
+ gem install alignment
9
+
10
+ == License
11
+
12
+ GPL2
13
+
14
+ The Gale-Church implementation was derived from NATools.
15
+ Implementation is copyright Pernilla Danielsson, Daniel Ridings,
16
+ and Alberto Simões. Algorithm is copyright William Gale and
17
+ Kenneth Church.
18
+
19
+ Ruby version is copyright Marius L. Jøhndal.
20
+
21
+ == Usage
22
+
23
+
24
+ == Development
25
+
26
+ The project is hosted on GitHub at http://github.com/mlj/alignment.
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
+ # -*- encoding: utf-8 -*-
2
# Define the gem packaging tasks through Jeweler when it can be loaded;
# when it cannot, print an installation hint instead of failing.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    gem.name              = "alignment"
    gem.summary           = "Alignment functions for corpus linguistics"
    gem.description       = "Alignment functions for corpus linguistics."
    gem.authors           = ['Marius L. Jøhndal']
    gem.email             = "mariuslj (at) ifi [dot] uio (dot) no"
    gem.homepage          = "http://github.com/mlj/alignment"
    gem.rubyforge_project = "alignment"
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.0
data/alignment.gemspec ADDED
@@ -0,0 +1,47 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "alignment"
8
+ s.version = "0.2.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Marius L. J\303\270hndal"]
12
+ s.date = "2011-12-03"
13
+ s.description = "Alignment functions for corpus linguistics."
14
+ s.email = "mariuslj (at) ifi [dot] uio (dot) no"
15
+ s.extensions = ["ext/gale_church/extconf.rb"]
16
+ s.extra_rdoc_files = [
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ "CHANGELOG",
21
+ "Manifest",
22
+ "README.rdoc",
23
+ "Rakefile",
24
+ "VERSION",
25
+ "ext/gale_church/.gitignore",
26
+ "ext/gale_church/extconf.rb",
27
+ "ext/gale_church/gale_church.c",
28
+ "lib/.gitignore",
29
+ "lib/alignment.rb",
30
+ "test/test_alignment.rb"
31
+ ]
32
+ s.homepage = "http://github.com/mlj/alignment"
33
+ s.require_paths = ["lib"]
34
+ s.rubyforge_project = "alignment"
35
+ s.rubygems_version = "1.8.11"
36
+ s.summary = "Alignment functions for corpus linguistics"
37
+
38
+ if s.respond_to? :specification_version then
39
+ s.specification_version = 3
40
+
41
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
42
+ else
43
+ end
44
+ else
45
+ end
46
+ end
47
+
@@ -0,0 +1,3 @@
1
+ Makefile
2
+ *.o
3
+ *.so
@@ -0,0 +1,5 @@
1
# Build configuration for the gale_church C extension: sets up the
# compiler options and asks mkmf to write the Makefile.
require 'mkmf'

extension_name = 'gale_church'
dir_config(extension_name)

# Append -Wall instead of assigning to $CFLAGS: assignment would throw away
# the compiler flags mkmf had already placed there.
$CFLAGS = "#{$CFLAGS} -Wall"

create_makefile(extension_name)
@@ -0,0 +1,411 @@
1
+ /* NATools - Package with parallel corpora tools
2
+ * Original Implementation Copyright:
3
+ * Pernilla Danielsson and Daniel Ridings
4
+ * Algorithm Copyright:
5
+ * William Gale and Kenneth Church
6
+ * Copyright (C) 2002-2004 Alberto Sim�es
7
+
8
+ * Ruby adaptation:
9
+ * Copyright (C) 2008 Marius L. J�hndal
10
+ *
11
+ * This package is free software; you can redistribute it and/or
12
+ * modify it under the terms of the GNU Lesser General Public
13
+ * License as published by the Free Software Foundation; either
14
+ * version 2 of the License, or (at your option) any later version.
15
+ *
16
+ * This library is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19
+ * Lesser General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU Lesser General Public
22
+ * License along with this library; if not, write to the
23
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24
+ * Boston, MA 02111-1307, USA.
25
+ */
26
+
27
+ #include "ruby.h"
28
+ #include <assert.h>
29
+ #include <math.h>
30
+ #include <stdio.h>
31
+ #include <fcntl.h>
32
+ #include <stdlib.h>
33
+ #include <unistd.h>
34
+ #include <string.h>
35
+
36
/* Ruby 1.8 compatibility: fallback for ruby.h headers without RARRAY_LEN.
 * The guard tests the macro that is actually defined here, so the fallback
 * is neither skipped when it is needed nor a redefinition when it is not. */
#ifndef RARRAY_LEN
#define RARRAY_LEN(ary) (RARRAY(ary)->len)
#endif
39
+
40
+ VALUE mLogos = Qnil;
41
+ VALUE mGaleChurch = Qnil;
42
+
43
+ void Init_gale_church();
44
+
45
+ VALUE method_align(VALUE self, VALUE len1, VALUE len2);
46
+
47
+ struct alignment {
48
+ int x1;
49
+ int y1;
50
+ int x2;
51
+ int y2;
52
+ int d;
53
+ };
54
+
55
+ static int two_side_distance(int x1, int y1, int x2, int y2);
56
+ static int seq_align(int *x, int *y, int nx, int ny, int (*dist_funct)(),
57
+ struct alignment **align);
58
+
59
+ void Init_gale_church()
60
+ {
61
+ mLogos = rb_define_module("Alignment");
62
+ mGaleChurch = rb_define_module_under(mLogos, "GaleChurch");
63
+ rb_define_module_function(mGaleChurch, "align", method_align, 2);
64
+ }
65
+
66
+ VALUE method_align(VALUE self, VALUE len1, VALUE len2)
67
+ {
68
+ VALUE ret, res1, res2, grouped_regions1, grouped_regions2;
69
+
70
+ int n, i, ix, iy, prevx, prevy;
71
+ struct alignment *align, *a;
72
+ int *len1_x;
73
+ int *len2_x;
74
+
75
+ res1 = rb_ary_new();
76
+ res2 = rb_ary_new();
77
+
78
+ len1_x = (int *)malloc(RARRAY_LEN(len1) * sizeof(int));
79
+ len2_x = (int *)malloc(RARRAY_LEN(len2) * sizeof(int));
80
+
81
+ for (i = 0; i < RARRAY_LEN(len1); i++)
82
+ len1_x[i] = NUM2INT(rb_ary_entry(len1, i));
83
+
84
+ for (i = 0; i < RARRAY_LEN(len2); i++)
85
+ len2_x[i] = NUM2INT(rb_ary_entry(len2, i));
86
+
87
+ n = seq_align(len1_x, len2_x, RARRAY_LEN(len1), RARRAY_LEN(len2),
88
+ two_side_distance, &align);
89
+
90
+ free(len1_x);
91
+ free(len2_x);
92
+
93
+ prevx = prevy = ix = iy = 0;
94
+
95
+ for (i = 0; i < n; i++) {
96
+ a = &align[i];
97
+
98
+ if (a->x2 > 0)
99
+ ix++;
100
+ else if (a->x1 == 0)
101
+ ix--;
102
+
103
+ if (a->y2 > 0)
104
+ iy++;
105
+ else if (a->y1 == 0)
106
+ iy--;
107
+
108
+ if (a->x1 == 0 && a->y1 == 0 && a->x2 == 0 && a->y2 == 0) {
109
+ ix++;
110
+ iy++;
111
+ }
112
+
113
+ ix++;
114
+ iy++;
115
+
116
+ grouped_regions1 = rb_ary_new();
117
+ for (; prevx < ix; prevx++)
118
+ rb_ary_push(grouped_regions1, INT2FIX(prevx));
119
+ rb_ary_push(res1, grouped_regions1);
120
+
121
+ grouped_regions2 = rb_ary_new();
122
+ for (; prevy < iy; prevy++)
123
+ rb_ary_push(grouped_regions2, INT2FIX(prevy));
124
+ rb_ary_push(res2, grouped_regions2);
125
+ }
126
+
127
+ free(align);
128
+
129
+ ret = rb_ary_new();
130
+ rb_ary_push(ret, res1);
131
+ rb_ary_push(ret, res2);
132
+
133
+ return ret;
134
+ }
135
+
136
+ /********************************************************************/
137
+
138
+ /**
139
+ * @file
140
+ * @brief Sentence-aligner main program
141
+ *
142
+ * Sentence-aligner software heavily based on Pernilla Danielsson and
143
+ * Daniel Ridings implementation of William Gale and Kenneth Church
144
+ * sentence aligner algorithm
145
+ *
146
+ * The compiled program is used as
147
+ * <pre>
148
+ * nat-sentence-aligner -D '.PARA' -d '.End of Sentence' file1 file2
149
+ * </pre>
150
+ * where both <i>file1</i> and <i>file2</i> are tokenized and with a
151
+ * token per line. In the example, '.PARA' is the hard delimiter,
152
+ * and '.End of Sentence' the soft delimiter. They are considered as
153
+ * single tokens, and as such, should appear in a line by themselves.
154
+ *
155
+ * The program is allowed to delete soft delimiters as necessary in
156
+ * order to align the files, but it cannot change hard delimiters.
157
+ *
158
+ * The output will be written in two files: file1.al and file2.al
159
+ * where each one contains aligned sentences.
160
+ *
161
+ * For debugging it can be useful to output the data in just one file.
162
+ * In this case, use the '-s' switch and just the 'file1.al' will be
163
+ * created.
164
+ *
165
+ * @todo Check if we want to document all the functions
166
+ */
167
+
168
+
169
+ #define dist(x,y) distances[(x) * ((ny) + 1) + (y)]
170
+ #define pathx(x,y) path_x[(x) * ((ny) + 1) + (y)]
171
+ #define pathy(x,y) path_y[(x) * ((ny) + 1) + (y)]
172
+
173
+ /**
174
+ * @brief Contant representing a big align distance between two
175
+ * sentences
176
+ */
177
+ #define BIG_DISTANCE 2500
178
+
179
+ /*
180
+
181
+ seq_align by Mike Riley
182
+ Sequence alignment routine.
183
+ This version allows for contraction/expansions.
184
+
185
+ x and y are sequences of objects, represented as non-zero ints, to be aligned.
186
+
187
+ dist_funct(x1, y1, x2, y2) is a distance function of 4 args:
188
+
189
+ dist_funct(x1, y1, 0, 0) gives cost of substitution of x1 by y1.
190
+ dist_funct(x1, 0, 0, 0) gives cost of deletion of x1.
191
+ dist_funct(0, y1, 0, 0) gives cost of insertion of y1.
192
+ dist_funct(x1, y1, x2, 0) gives cost of contraction of (x1,x2) to y1.
193
+ dist_funct(x1, y1, 0, y2) gives cost of expansion of x1 to (y1,y2).
194
+ dist_funct(x1, y1, x2, y2) gives cost to match (x1,x2) to (y1,y2).
195
+
196
+ align is the alignment, with (align[i].x1, align[i].x2) aligned
197
+ with (align[i].y1, align[i].y2). Zero in align[].x1 and align[].y1
198
+ correspond to insertion and deletion, respectively. Non-zero in
199
+ align[].x2 and align[].y2 correspond to contraction and expansion,
200
+ respectively. align[].d gives the distance for that pairing.
201
+
202
+ The function returns the length of the alignment.
203
+
204
+ */
205
+
206
+ static int seq_align(int *x, int *y, int nx, int ny, int (*dist_funct)(),
207
+ struct alignment **align)
208
+ {
209
+ int *distances, *path_x, *path_y, n;
210
+ int i, j, oi, oj, di, dj, d1, d2, d3, d4, d5, d6, dmin;
211
+ struct alignment *ralign;
212
+
213
+ distances = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
214
+ path_x = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
215
+ path_y = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
216
+ ralign = (struct alignment *) malloc((nx + ny) * sizeof(struct alignment));
217
+
218
+ for (j = 0; j <= ny; j++) {
219
+ for (i = 0; i <= nx; i++) {
220
+ d1 = i>0 && j>0 ? /* substitution */
221
+ dist(i-1, j-1) + (*dist_funct)(x[i-1], y[j-1], 0, 0)
222
+ : INT_MAX;
223
+ d2 = i>0 ? /* deletion */
224
+ dist(i-1, j) + (*dist_funct)(x[i-1], 0, 0, 0)
225
+ : INT_MAX;
226
+ d3 = j>0 ? /* insertion */
227
+ dist(i, j-1) + (*dist_funct)(0, y[j-1], 0, 0)
228
+ : INT_MAX;
229
+ d4 = i>1 && j>0 ? /* contraction */
230
+ dist(i-2, j-1) + (*dist_funct)(x[i-2], y[j-1], x[i-1], 0)
231
+ : INT_MAX;
232
+ d5 = i>0 && j>1 ? /* expansion */
233
+ dist(i-1, j-2) + (*dist_funct)(x[i-1], y[j-2], 0, y[j-1])
234
+ : INT_MAX;
235
+ d6 = i>1 && j>1 ? /* melding */
236
+ dist(i-2, j-2) + (*dist_funct)(x[i-2], y[j-2], x[i-1], y[j-1])
237
+ : INT_MAX;
238
+
239
+ dmin = d1;
240
+ if (d2<dmin) dmin=d2;
241
+ if (d3<dmin) dmin=d3;
242
+ if (d4<dmin) dmin=d4;
243
+ if (d5<dmin) dmin=d5;
244
+ if (d6<dmin) dmin=d6;
245
+
246
+ if (dmin == INT_MAX) {
247
+ dist(i,j) = 0;
248
+ }
249
+ else if (dmin == d1) {
250
+ dist(i,j) = d1;
251
+ pathx(i,j) = i-1;
252
+ pathy(i,j) = j-1;
253
+ }
254
+ else if (dmin == d2) {
255
+ dist(i,j) = d2;
256
+ pathx(i,j) = i-1;
257
+ pathy(i,j) = j;
258
+ }
259
+ else if (dmin == d3) {
260
+ dist(i,j) = d3;
261
+ pathx(i,j) = i;
262
+ pathy(i,j) = j-1;
263
+ }
264
+ else if (dmin == d4) {
265
+ dist(i,j) = d4;
266
+ pathx(i,j) = i-2;
267
+ pathy(i,j) = j-1;
268
+ }
269
+ else if (dmin == d5){
270
+ dist(i,j) = d5;
271
+ pathx(i,j) = i-1;
272
+ pathy(i,j) = j-2;
273
+ }
274
+ else /* dmin == d6 */ {
275
+ dist(i,j) = d6;
276
+ pathx(i,j) = i-2;
277
+ pathy(i,j) = j-2;
278
+ }
279
+ }
280
+ }
281
+
282
+ n = 0;
283
+ for (i=nx, j=ny ; i>0 || j>0 ; i = oi, j = oj) {
284
+ oi = pathx(i, j);
285
+ oj = pathy(i, j);
286
+ di = i - oi;
287
+ dj = j - oj;
288
+
289
+ if (di == 1 && dj == 1) { /* substitution */
290
+ ralign[n].x1 = x[i-1];
291
+ ralign[n].y1 = y[j-1];
292
+ ralign[n].x2 = 0;
293
+ ralign[n].y2 = 0;
294
+ ralign[n++].d = dist(i, j) - dist(i-1, j-1);
295
+ }
296
+
297
+ else if (di == 1 && dj == 0) { /* deletion */
298
+ ralign[n].x1 = x[i-1];
299
+ ralign[n].y1 = 0;
300
+ ralign[n].x2 = 0;
301
+ ralign[n].y2 = 0;
302
+ ralign[n++].d = dist(i, j) - dist(i-1, j);
303
+ }
304
+
305
+ else if (di == 0 && dj == 1) { /* insertion */
306
+ ralign[n].x1 = 0;
307
+ ralign[n].y1 = y[j-1];
308
+ ralign[n].x2 = 0;
309
+ ralign[n].y2 = 0;
310
+ ralign[n++].d = dist(i, j) - dist(i, j-1);
311
+ }
312
+
313
+ else if (dj == 1) { /* contraction */
314
+ ralign[n].x1 = x[i-2];
315
+ ralign[n].y1 = y[j-1];
316
+ ralign[n].x2 = x[i-1];
317
+ ralign[n].y2 = 0;
318
+ ralign[n++].d = dist(i, j) - dist(i-2, j-1);
319
+ }
320
+
321
+ else if (di == 1) { /* expansion */
322
+ ralign[n].x1 = x[i-1];
323
+ ralign[n].y1 = y[j-2];
324
+ ralign[n].x2 = 0;
325
+ ralign[n].y2 = y[j-1];
326
+ ralign[n++].d = dist(i, j) - dist(i-1, j-2);
327
+ }
328
+ else /* di == 2 && dj == 2 */ { /* melding */
329
+ ralign[n].x1 = x[i-2];
330
+ ralign[n].y1 = y[j-2];
331
+ ralign[n].x2 = x[i-1];
332
+ ralign[n].y2 = y[j-1];
333
+ ralign[n++].d = dist(i, j) - dist(i-2, j-2);
334
+ }
335
+ }
336
+
337
+ *align = (struct alignment *) malloc(n * sizeof(struct alignment));
338
+
339
+ for (i=0; i<n; i++)
340
+ bcopy(ralign + i, (*align) + (n-i-1), sizeof(struct alignment));
341
+
342
+ free(distances);
343
+ free(path_x);
344
+ free(path_y);
345
+ free(ralign);
346
+ return(n);
347
+ }
348
+
349
+ /* Returns the area under a normal distribution
350
+ from -inf to z standard deviations */
351
+ static double pnorm(double z)
352
+ {
353
+ double t, pd;
354
+ t = 1/(1 + 0.2316419 * z);
355
+ pd = 1 - 0.3989423 *
356
+ exp(-z * z/2) *
357
+ ((((1.330274429 * t - 1.821255978) * t
358
+ + 1.781477937) * t - 0.356563782) * t + 0.319381530) * t;
359
+ /* see Gradsteyn & Rhyzik, 26.2.17 p932 */
360
+ return(pd);
361
+ }
362
+
363
+
364
+ /* Return -100 * log probability that an English sentence of length
365
+ len1 is a translation of a foreign sentence of length len2. The
366
+ probability is based on two parameters, the mean and variance of
367
+ number of foreign characters per English character.
368
+ */
369
+
370
+ static int match(int len1,int len2)
371
+ {
372
+ double z, pd, mean;
373
+
374
+ /* foreign characters per english character */
375
+ double foreign_chars_per_eng_char = 1;
376
+
377
+ /* variance per english character */
378
+ double var_per_eng_char = 6.8 ;
379
+
380
+ if (len1==0 && len2==0) return(0);
381
+ mean = (len1 + len2/foreign_chars_per_eng_char)/2;
382
+ z = (foreign_chars_per_eng_char * len1 - len2)/sqrt(var_per_eng_char * mean);
383
+
384
+ /* Need to deal with both sides of the normal distribution */
385
+ if (z < 0) z = -z;
386
+ pd = 2 * (1 - pnorm(z));
387
+
388
+ if (pd > 0) return((int)(-100 * log(pd)));
389
+ else return(BIG_DISTANCE);
390
+ }
391
+
392
/*
 * Distance function passed to seq_align(): the match() cost of the
 * combined lengths on each side plus a fixed penalty that depends on the
 * kind of step. A zero x2 / y2 means that side has no second segment.
 */
static int two_side_distance(int x1, int y1, int x2, int y2)
{
  const int penalty21 = 230; /* -100 * log([prob of 2-1 match] / [prob of 1-1 match]) */
  const int penalty22 = 440; /* -100 * log([prob of 2-2 match] / [prob of 1-1 match]) */
  const int penalty01 = 450; /* -100 * log([prob of 0-1 match] / [prob of 1-1 match]) */

  if (x2 != 0 && y2 != 0)         /* melding */
    return match(x1 + x2, y1 + y2) + penalty22;

  if (y2 != 0)                    /* expansion (x2 == 0) */
    return match(x1, y1 + y2) + penalty21;

  if (x2 != 0)                    /* contraction (y2 == 0) */
    return match(x1 + x2, y1) + penalty21;

  /* From here on x2 == 0 and y2 == 0. */
  if (x1 == 0 || y1 == 0)         /* insertion or deletion */
    return match(x1, y1) + penalty01;

  return match(x1, y1);           /* substitution */
}
data/lib/.gitignore ADDED
@@ -0,0 +1 @@
1
+ *.so
data/lib/alignment.rb ADDED
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # alignment.rb - Alignment functions
4
+ #
5
+ # Written by Marius L. Jøhndal <mariuslj at ifi.uio.no>, 2008
6
+ #
7
+ require 'gale_church'
8
+
9
# Alignment of sequences of weighted regions. The alignment itself is
# computed by Alignment::GaleChurch.align (the gale_church extension).
module Alignment
  # Returns true if +o+ can be aligned, i.e. if it responds to +weight+.
  def self.is_alignable?(o)
    o.respond_to?(:weight)
  end

  # Represents an alignment of two regions, one +left+ region and one
  # +right+ region. Both regions are arrays of alignable objects.
  AlignedRegions = Struct.new(:left, :right)

  class AlignedRegions
    # Renders the pair as "<left,right>", concatenating the string form of
    # the objects on each side. Array#join is used explicitly because
    # interpolating an Array only concatenates its elements on Ruby 1.8;
    # later versions interpolate the inspect form instead.
    def to_s
      "<#{left.join},#{right.join}>"
    end
  end

  # Aligns two arrays of regions +a+ and +b+. Returns an array of
  # +AlignedRegions+.
  #
  # Every region must respond to +weight+. Raises ArgumentError for an
  # unknown +method+ and RuntimeError if the aligner returns a different
  # number of blocks for the two sides.
  def self.align_regions(a, b, method = :gale_church)
    case method
    when :gale_church
      x, y = Alignment::GaleChurch.align(a.map(&:weight), b.map(&:weight))
      raise "Error aligning regions: returned block count does not match" unless x.length == y.length

      # x and y hold, per aligned block, the indices of the regions in it.
      x.zip(y).map { |r, s| AlignedRegions.new(r.map { |i| a[i] }, s.map { |i| b[i] }) }
    else
      raise ArgumentError, "invalid method #{method}"
    end
  end

  # A piece of text that can be aligned.
  class AlignableText
    attr_reader :text

    def initialize(text)
      @text = text
    end

    # The weight used for alignment: the number of space-separated words.
    # TODO: the original carried FIXME notes for both character length and
    # word count; only the word count was ever returned.
    def weight
      @text.split(' ').length
    end

    def to_s
      @text
    end
  end

  ANCHOR_REGEXP = /\s*\|\|\s*/
  BOUNDARY_REGEXP = /\s*\|\s*/

  # Aligns two strings of text +a+ and +b+ using method +method+. The
  # boundaries of the blocks to be aligned are denoted by a | character
  # (with or without surrounding white-space). The sequence || denotes an
  # anchor (or hard delimiter), i.e. a fixed synchronisation point. There
  # must be the same number of anchors in both strings, but the number of
  # boundaries may be different. The returned value is an array of pairs
  # of aligned strings.
  #
  # Raises ArgumentError if the strings split into a different number of
  # anchored regions.
  def self.align_text(a, b, method = :gale_church)
    regions_a = a.split(ANCHOR_REGEXP)
    regions_b = b.split(ANCHOR_REGEXP)

    raise ArgumentError, "different number of anchors in strings" unless regions_a.length == regions_b.length

    to_blocks = lambda { |region| region.split(BOUNDARY_REGEXP).map { |block| AlignableText.new(block) } }

    result = []
    regions_a.zip(regions_b).each do |x, y|
      aligned = align_regions(to_blocks.call(x), to_blocks.call(y), method)
      result.concat(aligned.map { |i| [i.left.map(&:text).join(' '), i.right.map(&:text).join(' ')] })
    end

    result
  end
end
@@ -0,0 +1,41 @@
1
+ require 'alignment'
2
+ require 'test/unit'
3
+
4
# Tests for Alignment.align_regions and Alignment.align_text using the
# default Gale-Church method.
class AlignmentTestCase < Test::Unit::TestCase
  def test_church_alignment
    assert_equal "<foo,bar>", aligned(%w[foo], %w[bar])
    assert_equal "<foo,barbaz>", aligned(%w[foo], %w[bar baz])
    assert_equal "<,bar>", aligned([], %w[bar])
    assert_equal "", aligned([], [])
    assert_equal "<foo,bar>,<koo,baz>", aligned(%w[foo koo], %w[bar baz])
  end

  def test_church_text_alignment
    # Each case: left text, right text, expected aligned pairs.
    cases = [
      ["The quick brown fox | jumps | over the lazy dog",
       "Den kvikke brune reven | hopper | over den trege hunden",
       [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper"], ["over the lazy dog", "over den trege hunden"]]],
      ["The quick brown fox | jumps | over the lazy dog",
       "Den kvikke brune reven | hopper elegant | over den trege hunden",
       [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper elegant"], ["over the lazy dog", "over den trege hunden"]]],
      ["The quick brown fox | jumps || over the lazy dog",
       "Den kvikke brune reven | hopper || elegant | over den trege hunden",
       [["The quick brown fox", "Den kvikke brune reven"], ["jumps", "hopper"], ["over the lazy dog", "elegant over den trege hunden"]]]
    ]

    cases.each do |left, right, expected|
      assert_equal expected, Alignment::align_text(left, right)
    end
  end

  private

  # Wraps the given strings in AlignableText objects, aligns the two lists
  # and returns the resulting regions rendered as one comma-separated string.
  def aligned(left, right)
    x = left.map { |s| Alignment::AlignableText.new(s) }
    y = right.map { |s| Alignment::AlignableText.new(s) }
    Alignment::align_regions(x, y).map(&:to_s).join(",")
  end
end
41
+
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: alignment
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
11
+ platform: ruby
12
+ authors:
13
+ - "Marius L. J\xC3\xB8hndal"
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-12-11 00:00:00 Z
19
+ dependencies: []
20
+
21
+ description: Alignment functions for corpus linguistics.
22
+ email: mariuslj (at) ifi [dot] uio (dot) no
23
+ executables: []
24
+
25
+ extensions:
26
+ - ext/gale_church/extconf.rb
27
+ extra_rdoc_files:
28
+ - README.rdoc
29
+ files:
30
+ - CHANGELOG
31
+ - Manifest
32
+ - README.rdoc
33
+ - Rakefile
34
+ - VERSION
35
+ - alignment.gemspec
36
+ - ext/gale_church/.gitignore
37
+ - ext/gale_church/extconf.rb
38
+ - ext/gale_church/gale_church.c
39
+ - lib/.gitignore
40
+ - lib/alignment.rb
41
+ - test/test_alignment.rb
42
+ homepage: http://github.com/mlj/alignment
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ hash: 3
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 3
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project: alignment
71
+ rubygems_version: 1.8.11
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Alignment functions for corpus linguistics
75
+ test_files: []
76
+