linkparser 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/ext/sentence.c ADDED
@@ -0,0 +1,536 @@
1
+ /*
2
+ * sentence.c - Ruby LinkParser
3
+ * $Id: sentence.c 48 2008-12-19 18:30:33Z deveiant $
4
+ *
5
+ * Authors:
6
+ * * Michael Granger <ged@FaerieMUD.org>
7
+ *
8
+ * Please see the LICENSE file at the top of the distribution for licensing
9
+ * information.
10
+ */
11
+
12
+ #include "linkparser.h"
13
+
14
+
15
+ /* --------------------------------------------------
16
+ * Forward declarations
17
+ * -------------------------------------------------- */
18
+
19
+
20
+ /* --------------------------------------------------
21
+ * Macros and constants
22
+ * -------------------------------------------------- */
23
+
24
+
25
+ /* --------------------------------------------------
26
+ * Memory-management functions
27
+ * -------------------------------------------------- */
28
+
29
+ /*
30
+ * Allocation function
31
+ */
32
+ static rlink_SENTENCE *
33
+ rlink_sentence_alloc() {
34
+ rlink_SENTENCE *ptr = ALLOC( rlink_SENTENCE );
35
+
36
+ ptr->sentence = NULL;
37
+ ptr->dictionary = Qnil;
38
+ ptr->parsed_p = Qfalse;
39
+ ptr->options = Qnil;
40
+
41
+ debugMsg(( "Initialized an rlink_SENTENCE <%p>", ptr ));
42
+ return ptr;
43
+ }
44
+
45
+
46
+ /*
47
+ * GC Mark function
48
+ */
49
+ static void
50
+ rlink_sentence_gc_mark( rlink_SENTENCE *ptr ) {
51
+ debugMsg(( "Marking LinkParser::Sentence %p", ptr ));
52
+
53
+ if ( ptr ) {
54
+ rb_gc_mark( ptr->dictionary );
55
+ rb_gc_mark( ptr->options );
56
+ }
57
+
58
+ else {
59
+ debugMsg(( "Not marking uninitialized rlink_SENTENCE" ));
60
+ }
61
+ }
62
+
63
+
64
+ /*
65
+ * GC Free function
66
+ */
67
+ static void
68
+ rlink_sentence_gc_free( rlink_SENTENCE *ptr ) {
69
+ if ( ptr ) {
70
+ debugMsg(( "In free function of Sentence <%p>", ptr ));
71
+
72
+ if ( rlink_get_dict(ptr->dictionary) ) {
73
+ debugMsg(( "Freeing Sentence <%p>", ptr->sentence ));
74
+ sentence_delete( (Sentence)ptr->sentence );
75
+ } else {
76
+ debugMsg(( "Not freeing a Sentence belonging to an already-freed dictionary." ));
77
+ }
78
+
79
+ ptr->sentence = NULL;
80
+ ptr->options = Qnil;
81
+ ptr->dictionary = Qnil;
82
+ }
83
+
84
+ else {
85
+ debugMsg(( "Not freeing an uninitialized rlink_SENTENCE" ));
86
+ }
87
+ }
88
+
89
+
90
+ /*
91
+ * Object validity checker. Returns the data pointer.
92
+ */
93
+ static rlink_SENTENCE *
94
+ check_sentence( VALUE self ) {
95
+ Check_Type( self, T_DATA );
96
+
97
+ if ( !IsSentence(self) ) {
98
+ rb_raise( rb_eTypeError, "wrong argument type %s (expected LinkParser::Sentence)",
99
+ rb_class2name(CLASS_OF( self )) );
100
+ }
101
+
102
+ return DATA_PTR( self );
103
+ }
104
+
105
+
106
+ /*
107
+ * Fetch the data pointer and check it for sanity.
108
+ */
109
+ static rlink_SENTENCE *
110
+ get_sentence( VALUE self ) {
111
+ rlink_SENTENCE *ptr = check_sentence( self );
112
+
113
+ if ( !ptr )
114
+ rb_raise( rb_eRuntimeError, "uninitialized Sentence" );
115
+
116
+ return ptr;
117
+ }
118
+
119
+
120
+ /*
121
+ * Publicly-usable sentence-fetcher
122
+ */
123
+ rlink_SENTENCE *
124
+ rlink_get_sentence( VALUE self ) {
125
+ return get_sentence( self );
126
+ }
127
+
128
+
129
+
130
+ /* --------------------------------------------------
131
+ * Class Methods
132
+ * -------------------------------------------------- */
133
+
134
+ /*
135
+ * call-seq:
136
+ * LinkParser::Sentence.allocate -> sentence
137
+ *
138
+ * Allocate a new LinkParser::Sentence object.
139
+ *
140
+ */
141
+ static VALUE
142
+ rlink_sentence_s_alloc( VALUE klass ) {
143
+ debugMsg(( "Wrapping an uninitialized Sentence pointer." ));
144
+ return Data_Wrap_Struct( klass, rlink_sentence_gc_mark, rlink_sentence_gc_free, 0 );
145
+ }
146
+
147
+
148
+ /* --------------------
149
+ * Instance methods
150
+ * -------------------- */
151
+
152
+ /*
153
+ * call-seq:
154
+ * LinkParser::Sentence.new( str, dict ) -> sentence
155
+ *
156
+ * Create a new LinkParser::Sentence object from the given input string
157
+ # using the specified LinkParser::Dictionary.
158
+ *
159
+ * dict = LinkParser::Dictionary.new
160
+ * LinkParser::Sentence.new( "The boy runs", dict ) #=> #<LinkParser::Sentence:0x5481ac>
161
+ */
162
+ static VALUE
163
+ rlink_sentence_init( VALUE self, VALUE input_string, VALUE dictionary ) {
164
+ if ( !check_sentence(self) ) {
165
+ rlink_SENTENCE *ptr;
166
+ Sentence sent;
167
+ Dictionary dict = rlink_get_dict( dictionary );
168
+
169
+ if ( !(sent = sentence_create( StringValueCStr(input_string), dict )) )
170
+ rlink_raise_lp_error();
171
+
172
+ DATA_PTR( self ) = ptr = rlink_sentence_alloc();
173
+
174
+ ptr->sentence = sent;
175
+ ptr->dictionary = dictionary;
176
+ ptr->options = Qnil;
177
+
178
+ } else {
179
+ rb_raise( rb_eRuntimeError,
180
+ "Cannot re-initialize a sentence once it's been created." );
181
+ }
182
+
183
+ return self;
184
+ }
185
+
186
+
187
+ /*
188
+ * call-seq:
189
+ * sentence.parse( options={} ) -> fixnum
190
+ *
191
+ * Attach a parse set to this sentence and return the number of linkages
192
+ * found. If any +options+ are specified, they override those set in the
193
+ * sentence's dictionary.
194
+ *
195
+ */
196
+ static VALUE
197
+ rlink_sentence_parse( int argc, VALUE *argv, VALUE self ) {
198
+ rlink_SENTENCE *ptr = get_sentence( self );
199
+ Parse_Options opts;
200
+ VALUE defopts = Qnil;
201
+ VALUE options = Qnil;
202
+ int link_count = 0;
203
+
204
+ if ( RTEST(ptr->parsed_p) )
205
+ rb_raise( rlink_eLpError, "Can't reparse a sentence." );
206
+
207
+ /* Merge the hash from this call with the one from the dict and build
208
+ Parse_Options from it. */
209
+ rb_scan_args( argc, argv, "01", &options );
210
+ defopts = rb_funcall( ptr->dictionary, rb_intern("options"), 0 );
211
+
212
+ /* Turn the option hash into a ParseOptions object, then extract the
213
+ Parse_Options struct from that */
214
+ options = rlink_make_parse_options( defopts, options );
215
+ opts = rlink_get_parseopts( options );
216
+
217
+ /* Parse the sentence */
218
+ if ( (link_count = sentence_parse( ptr->sentence, opts )) < 0 )
219
+ rlink_raise_lp_error();
220
+
221
+ ptr->options = options;
222
+ ptr->parsed_p = Qtrue;
223
+
224
+ return INT2FIX( link_count );
225
+ }
226
+
227
+
228
+ /*
229
+ * call-seq:
230
+ * sentence.parsed? -> true or false
231
+ *
232
+ * Returns +true+ if the sentence has been parsed.
233
+ *
234
+ * sentence.parsed? #-> false
235
+ * sentence.parse #-> 6
236
+ * sentence.parsed? #-> true
237
+ */
238
+ static VALUE
239
+ rlink_sentence_parsed_p( VALUE self ) {
240
+ rlink_SENTENCE *ptr = get_sentence( self );
241
+ return ptr->parsed_p;
242
+ }
243
+
244
+
245
+ /*
246
+ * call-seq:
247
+ * sentence.options -> parseoptions
248
+ *
249
+ * Returns a ParseOptions object for the receiving sentence.
250
+ *
251
+ * sentence.options.verbosity = 3
252
+ * sentence.options.islands_ok? # -> true
253
+ */
254
+ static VALUE
255
+ rlink_sentence_options( VALUE self ) {
256
+ rlink_SENTENCE *ptr = get_sentence( self );
257
+ return ptr->options;
258
+ }
259
+
260
+
261
+
262
+
263
+ /*
264
+ * call-seq:
265
+ * sentence.linkages -> array
266
+ *
267
+ * Returns an Array of LinkParser::Linkage objects which represent the
268
+ * parts parsed from the sentence for the current linkage.
269
+ *
270
+ */
271
+ static VALUE
272
+ rlink_sentence_linkages( VALUE self ) {
273
+ rlink_SENTENCE *ptr = get_sentence( self );
274
+ int i, count = 0;
275
+ VALUE rary;
276
+
277
+ if ( !RTEST(ptr->parsed_p) )
278
+ rlink_sentence_parse( 0, 0, self );
279
+
280
+ count = sentence_num_valid_linkages( (Sentence)ptr->sentence );
281
+ rary = rb_ary_new2( count );
282
+
283
+ for ( i = 0; i < count; i++ ) {
284
+ VALUE linkage;
285
+ VALUE args[2];
286
+
287
+ args[0] = INT2FIX( i );
288
+ args[1] = self;
289
+
290
+ linkage = rb_class_new_instance( 2, args, rlink_cLinkage );
291
+ rb_ary_store( rary, i, linkage );
292
+ }
293
+
294
+ return rary;
295
+ }
296
+
297
+
298
+ /*
299
+ * call-seq:
300
+ * sentence.length -> fixnum
301
+ *
302
+ * Returns the number of words in the tokenized sentence, including the
303
+ * boundary words and punctuation.
304
+ *
305
+ */
306
+
307
+ static VALUE
308
+ rlink_sentence_length( VALUE self ) {
309
+ rlink_SENTENCE *ptr = get_sentence( self );
310
+ return INT2FIX( sentence_length((Sentence)ptr->sentence) );
311
+ }
312
+
313
+
314
+ /*
315
+ * call-seq:
316
+ * sentence.word( idx ) -> str
317
+ *
318
+ * Returns the spelling of the n-th word in the sentence as it appears after
319
+ * tokenization.
320
+ */
321
+ static VALUE
322
+ rlink_sentence_word( VALUE self, VALUE n ) {
323
+ rlink_SENTENCE *ptr = get_sentence( self );
324
+ char *word;
325
+
326
+ word = sentence_get_word( (Sentence)ptr->sentence, FIX2INT(n) );
327
+ return rb_str_new2( word );
328
+ }
329
+
330
+
331
+ /*
332
+ * call-seq:
333
+ * sentence.words -> array
334
+ *
335
+ * Returns the words of the sentence as they appear after tokenization.
336
+ *
337
+ * sentence = LinkParser::Dictionary.new.parse( "The dogs barks." )
338
+ * sentence.words #->
339
+ */
340
+ static VALUE
341
+ rlink_sentence_words( VALUE self ) {
342
+ rlink_SENTENCE *ptr = get_sentence( self );
343
+ char *word;
344
+ int i, length;
345
+ VALUE words = rb_ary_new();
346
+
347
+ length = sentence_length( (Sentence)ptr->sentence );
348
+ for ( i = 0; i < length; i++ ) {
349
+ word = sentence_get_word( (Sentence)ptr->sentence, i );
350
+ rb_ary_push( words, rb_str_new2(word) );
351
+ }
352
+
353
+ return words;
354
+ }
355
+
356
+
357
+ /*
358
+ * call-seq:
359
+ * sentence[index] -> str
360
+ * sentence[start, length] -> str
361
+ * sentence[range] -> str
362
+ *
363
+ * Element Reference---Returns the element at index, or returns a subarray
364
+ * starting at start and continuing for length elements, or returns a subarray
365
+ * specified by range. Negative indices count backward from the end of the
366
+ * array (-1 is the last element). Returns nil if the index (or starting
367
+ * index) are out of range.
368
+ *
369
+ * sent = dict.parse( "Birds fly south for the winter." )
370
+ *
371
+ * sent[1] # => "birds"
372
+ * sent[0,4] # => ["LEFT-WALL", "birds", "fly", "south"]
373
+ * sent[1..3] # => ["birds", "fly", "south"]
374
+ *
375
+ */
376
+ static VALUE
377
+ rlink_sentence_aref( argc, argv, self )
378
+ int argc;
379
+ VALUE *argv;
380
+ VALUE self;
381
+ {
382
+ VALUE words = rlink_sentence_words( self );
383
+ return rb_funcall2( words, rb_intern("[]"), argc, argv );
384
+ }
385
+
386
+
387
+ /*
388
+ * call-seq:
389
+ * sentence.null_count -> int
390
+ *
391
+ * Returns the number of null links that were used in parsing the sentence.
392
+ */
393
+ static VALUE
394
+ rlink_sentence_null_count( VALUE self ) {
395
+ rlink_SENTENCE *ptr = get_sentence( self );
396
+ int count;
397
+
398
+ count = sentence_null_count( (Sentence)ptr->sentence );
399
+ return INT2FIX( count );
400
+ }
401
+
402
+
403
+ /*
404
+ * call-seq:
405
+ * sentence.num_linkages_found -> fixnum
406
+ *
407
+ * Returns the number of linkages found when parsing the sentence. This will
408
+ * cause the sentence to be parsed if it hasn't been already.
409
+ */
410
+ static VALUE
411
+ rlink_sentence_num_linkages_found( VALUE self ) {
412
+ rlink_SENTENCE *ptr = get_sentence( self );
413
+ int i = 0;
414
+
415
+ if ( !RTEST(ptr->parsed_p) )
416
+ rlink_sentence_parse( 0, 0, self );
417
+ i = sentence_num_linkages_found( (Sentence)ptr->sentence );
418
+
419
+ return INT2FIX( i );
420
+ }
421
+
422
+
423
+ /*
424
+ * call-seq:
425
+ * sentence.num_valid_linkages -> fixnum
426
+ *
427
+ * Return the number of linkages that had no post-processing violations.
428
+ */
429
+ static VALUE
430
+ rlink_sentence_num_valid_linkages( VALUE self ) {
431
+ rlink_SENTENCE *ptr = get_sentence( self );
432
+ int count;
433
+
434
+ count = sentence_num_valid_linkages( (Sentence)ptr->sentence );
435
+ return INT2FIX( count );
436
+ }
437
+
438
+
439
+ /*
440
+ * call-seq:
441
+ * sentence.num_linkages_post_processed -> fixnum
442
+ *
443
+ * Return the number of linkages that were actually post-processed (which may
444
+ * be less than the number found because of the linkage_limit parameter).
445
+ */
446
+ static VALUE
447
+ rlink_sentence_num_linkages_post_processed( VALUE self ) {
448
+ rlink_SENTENCE *ptr = get_sentence( self );
449
+ int count;
450
+
451
+ count = sentence_num_linkages_post_processed( (Sentence)ptr->sentence );
452
+ return INT2FIX( count );
453
+ }
454
+
455
+
456
+ /*
457
+ * call-seq:
458
+ * sentence.num_violations( i ) -> fixnum
459
+ *
460
+ * The number of post-processing violations that the i-th linkage had during
461
+ * the last parse.
462
+ */
463
+ static VALUE
464
+ rlink_sentence_num_violations( VALUE self, VALUE i ) {
465
+ rlink_SENTENCE *ptr = get_sentence( self );
466
+ int count;
467
+
468
+ count = sentence_num_violations( (Sentence)ptr->sentence, FIX2INT(i) );
469
+ return INT2FIX( count );
470
+ }
471
+
472
+
473
+ /*
474
+ * call-seq:
475
+ * sentence.disjunct_cost( i ) -> fixnum
476
+ *
477
+ * The maximum cost of connectors used in the i-th linkage of the sentence.
478
+ */
479
+ static VALUE
480
+ rlink_sentence_disjunct_cost( VALUE self, VALUE i ) {
481
+ rlink_SENTENCE *ptr = get_sentence( self );
482
+ int count;
483
+
484
+ count = sentence_disjunct_cost( (Sentence)ptr->sentence, FIX2INT(i) );
485
+ return INT2FIX( count );
486
+ }
487
+
488
+
489
+ /*
490
+ * Document-class: LinkParser::Sentence
491
+ *
492
+ * A Sentence is the API's representation of an input string,
493
+ * tokenized and interpreted according to a specific Dictionary. After
494
+ * a Sentence is created and parsed, various attributes of the
495
+ * resulting set of linkages can be obtained.
496
+ *
497
+ */
498
+ void
499
+ rlink_init_sentence() {
500
+ rlink_cSentence = rb_define_class_under( rlink_mLinkParser, "Sentence",
501
+ rb_cObject );
502
+
503
+ rb_define_alloc_func( rlink_cSentence, rlink_sentence_s_alloc );
504
+
505
+ rb_define_method( rlink_cSentence, "initialize", rlink_sentence_init, 2 );
506
+ rb_define_method( rlink_cSentence, "parse", rlink_sentence_parse, -1 );
507
+ rb_define_method( rlink_cSentence, "parsed?", rlink_sentence_parsed_p, 0 );
508
+ rb_define_method( rlink_cSentence, "linkages", rlink_sentence_linkages, 0 );
509
+
510
+ rb_define_method( rlink_cSentence, "options", rlink_sentence_options, 0 );
511
+
512
+ rb_define_method( rlink_cSentence, "length", rlink_sentence_length, 0 );
513
+ rb_define_method( rlink_cSentence, "word", rlink_sentence_word, 1 );
514
+ rb_define_method( rlink_cSentence, "words", rlink_sentence_words, 0 );
515
+ rb_define_method( rlink_cSentence, "[]", rlink_sentence_aref, -1 );
516
+
517
+ rb_define_method( rlink_cSentence, "null_count",
518
+ rlink_sentence_null_count, 0 );
519
+ rb_define_method( rlink_cSentence, "num_linkages_found",
520
+ rlink_sentence_num_linkages_found, 0 );
521
+ rb_define_method( rlink_cSentence, "num_valid_linkages",
522
+ rlink_sentence_num_valid_linkages, 0 );
523
+ rb_define_method( rlink_cSentence, "num_linkages_post_processed",
524
+ rlink_sentence_num_linkages_post_processed, 0 );
525
+ rb_define_method( rlink_cSentence, "num_violations",
526
+ rlink_sentence_num_violations, 1 );
527
+ rb_define_method( rlink_cSentence, "disjunct_cost",
528
+ rlink_sentence_disjunct_cost, 1 );
529
+
530
+ /*
531
+ link_public_api(char *) sentence_get_nth_word(Sentence sent, int i);
532
+ link_public_api(int) sentence_nth_word_has_disjunction(Sentence sent, int i);
533
+ */
534
+
535
+ }
536
+