linkparser 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/sentence.c ADDED
@@ -0,0 +1,536 @@
1
+ /*
2
+ * sentence.c - Ruby LinkParser
3
+ * $Id: sentence.c 48 2008-12-19 18:30:33Z deveiant $
4
+ *
5
+ * Authors:
6
+ * * Michael Granger <ged@FaerieMUD.org>
7
+ *
8
+ * Please see the LICENSE file at the top of the distribution for licensing
9
+ * information.
10
+ */
11
+
12
+ #include "linkparser.h"
13
+
14
+
15
+ /* --------------------------------------------------
16
+ * Forward declarations
17
+ * -------------------------------------------------- */
18
+
19
+
20
+ /* --------------------------------------------------
21
+ * Macros and constants
22
+ * -------------------------------------------------- */
23
+
24
+
25
+ /* --------------------------------------------------
26
+ * Memory-management functions
27
+ * -------------------------------------------------- */
28
+
29
+ /*
30
+ * Allocation function
31
+ */
32
+ static rlink_SENTENCE *
33
+ rlink_sentence_alloc() {
34
+ rlink_SENTENCE *ptr = ALLOC( rlink_SENTENCE );
35
+
36
+ ptr->sentence = NULL;
37
+ ptr->dictionary = Qnil;
38
+ ptr->parsed_p = Qfalse;
39
+ ptr->options = Qnil;
40
+
41
+ debugMsg(( "Initialized an rlink_SENTENCE <%p>", ptr ));
42
+ return ptr;
43
+ }
44
+
45
+
46
+ /*
47
+ * GC Mark function
48
+ */
49
+ static void
50
+ rlink_sentence_gc_mark( rlink_SENTENCE *ptr ) {
51
+ debugMsg(( "Marking LinkParser::Sentence %p", ptr ));
52
+
53
+ if ( ptr ) {
54
+ rb_gc_mark( ptr->dictionary );
55
+ rb_gc_mark( ptr->options );
56
+ }
57
+
58
+ else {
59
+ debugMsg(( "Not marking uninitialized rlink_SENTENCE" ));
60
+ }
61
+ }
62
+
63
+
64
+ /*
65
+ * GC Free function
66
+ */
67
+ static void
68
+ rlink_sentence_gc_free( rlink_SENTENCE *ptr ) {
69
+ if ( ptr ) {
70
+ debugMsg(( "In free function of Sentence <%p>", ptr ));
71
+
72
+ if ( rlink_get_dict(ptr->dictionary) ) {
73
+ debugMsg(( "Freeing Sentence <%p>", ptr->sentence ));
74
+ sentence_delete( (Sentence)ptr->sentence );
75
+ } else {
76
+ debugMsg(( "Not freeing a Sentence belonging to an already-freed dictionary." ));
77
+ }
78
+
79
+ ptr->sentence = NULL;
80
+ ptr->options = Qnil;
81
+ ptr->dictionary = Qnil;
82
+ }
83
+
84
+ else {
85
+ debugMsg(( "Not freeing an uninitialized rlink_SENTENCE" ));
86
+ }
87
+ }
88
+
89
+
90
+ /*
91
+ * Object validity checker. Returns the data pointer.
92
+ */
93
+ static rlink_SENTENCE *
94
+ check_sentence( VALUE self ) {
95
+ Check_Type( self, T_DATA );
96
+
97
+ if ( !IsSentence(self) ) {
98
+ rb_raise( rb_eTypeError, "wrong argument type %s (expected LinkParser::Sentence)",
99
+ rb_class2name(CLASS_OF( self )) );
100
+ }
101
+
102
+ return DATA_PTR( self );
103
+ }
104
+
105
+
106
+ /*
107
+ * Fetch the data pointer and check it for sanity.
108
+ */
109
+ static rlink_SENTENCE *
110
+ get_sentence( VALUE self ) {
111
+ rlink_SENTENCE *ptr = check_sentence( self );
112
+
113
+ if ( !ptr )
114
+ rb_raise( rb_eRuntimeError, "uninitialized Sentence" );
115
+
116
+ return ptr;
117
+ }
118
+
119
+
120
+ /*
121
+ * Publicly-usable sentence-fetcher
122
+ */
123
+ rlink_SENTENCE *
124
+ rlink_get_sentence( VALUE self ) {
125
+ return get_sentence( self );
126
+ }
127
+
128
+
129
+
130
+ /* --------------------------------------------------
131
+ * Class Methods
132
+ * -------------------------------------------------- */
133
+
134
+ /*
135
+ * call-seq:
136
+ * LinkParser::Sentence.allocate -> sentence
137
+ *
138
+ * Allocate a new LinkParser::Sentence object.
139
+ *
140
+ */
141
+ static VALUE
142
+ rlink_sentence_s_alloc( VALUE klass ) {
143
+ debugMsg(( "Wrapping an uninitialized Sentence pointer." ));
144
+ return Data_Wrap_Struct( klass, rlink_sentence_gc_mark, rlink_sentence_gc_free, 0 );
145
+ }
146
+
147
+
148
+ /* --------------------
149
+ * Instance methods
150
+ * -------------------- */
151
+
152
+ /*
153
+ * call-seq:
154
+ * LinkParser::Sentence.new( str, dict ) -> sentence
155
+ *
156
+ * Create a new LinkParser::Sentence object from the given input string
157
+ # using the specified LinkParser::Dictionary.
158
+ *
159
+ * dict = LinkParser::Dictionary.new
160
+ * LinkParser::Sentence.new( "The boy runs", dict ) #=> #<LinkParser::Sentence:0x5481ac>
161
+ */
162
+ static VALUE
163
+ rlink_sentence_init( VALUE self, VALUE input_string, VALUE dictionary ) {
164
+ if ( !check_sentence(self) ) {
165
+ rlink_SENTENCE *ptr;
166
+ Sentence sent;
167
+ Dictionary dict = rlink_get_dict( dictionary );
168
+
169
+ if ( !(sent = sentence_create( StringValueCStr(input_string), dict )) )
170
+ rlink_raise_lp_error();
171
+
172
+ DATA_PTR( self ) = ptr = rlink_sentence_alloc();
173
+
174
+ ptr->sentence = sent;
175
+ ptr->dictionary = dictionary;
176
+ ptr->options = Qnil;
177
+
178
+ } else {
179
+ rb_raise( rb_eRuntimeError,
180
+ "Cannot re-initialize a sentence once it's been created." );
181
+ }
182
+
183
+ return self;
184
+ }
185
+
186
+
187
+ /*
188
+ * call-seq:
189
+ * sentence.parse( options={} ) -> fixnum
190
+ *
191
+ * Attach a parse set to this sentence and return the number of linkages
192
+ * found. If any +options+ are specified, they override those set in the
193
+ * sentence's dictionary.
194
+ *
195
+ */
196
+ static VALUE
197
+ rlink_sentence_parse( int argc, VALUE *argv, VALUE self ) {
198
+ rlink_SENTENCE *ptr = get_sentence( self );
199
+ Parse_Options opts;
200
+ VALUE defopts = Qnil;
201
+ VALUE options = Qnil;
202
+ int link_count = 0;
203
+
204
+ if ( RTEST(ptr->parsed_p) )
205
+ rb_raise( rlink_eLpError, "Can't reparse a sentence." );
206
+
207
+ /* Merge the hash from this call with the one from the dict and build
208
+ Parse_Options from it. */
209
+ rb_scan_args( argc, argv, "01", &options );
210
+ defopts = rb_funcall( ptr->dictionary, rb_intern("options"), 0 );
211
+
212
+ /* Turn the option hash into a ParseOptions object, then extract the
213
+ Parse_Options struct from that */
214
+ options = rlink_make_parse_options( defopts, options );
215
+ opts = rlink_get_parseopts( options );
216
+
217
+ /* Parse the sentence */
218
+ if ( (link_count = sentence_parse( ptr->sentence, opts )) < 0 )
219
+ rlink_raise_lp_error();
220
+
221
+ ptr->options = options;
222
+ ptr->parsed_p = Qtrue;
223
+
224
+ return INT2FIX( link_count );
225
+ }
226
+
227
+
228
+ /*
229
+ * call-seq:
230
+ * sentence.parsed? -> true or false
231
+ *
232
+ * Returns +true+ if the sentence has been parsed.
233
+ *
234
+ * sentence.parsed? #-> false
235
+ * sentence.parse #-> 6
236
+ * sentence.parsed? #-> true
237
+ */
238
+ static VALUE
239
+ rlink_sentence_parsed_p( VALUE self ) {
240
+ rlink_SENTENCE *ptr = get_sentence( self );
241
+ return ptr->parsed_p;
242
+ }
243
+
244
+
245
+ /*
246
+ * call-seq:
247
+ * sentence.options -> parseoptions
248
+ *
249
+ * Returns a ParseOptions object for the receiving sentence.
250
+ *
251
+ * sentence.options.verbosity = 3
252
+ * sentence.options.islands_ok? # -> true
253
+ */
254
+ static VALUE
255
+ rlink_sentence_options( VALUE self ) {
256
+ rlink_SENTENCE *ptr = get_sentence( self );
257
+ return ptr->options;
258
+ }
259
+
260
+
261
+
262
+
263
+ /*
264
+ * call-seq:
265
+ * sentence.linkages -> array
266
+ *
267
+ * Returns an Array of LinkParser::Linkage objects which represent the
268
+ * parts parsed from the sentence for the current linkage.
269
+ *
270
+ */
271
+ static VALUE
272
+ rlink_sentence_linkages( VALUE self ) {
273
+ rlink_SENTENCE *ptr = get_sentence( self );
274
+ int i, count = 0;
275
+ VALUE rary;
276
+
277
+ if ( !RTEST(ptr->parsed_p) )
278
+ rlink_sentence_parse( 0, 0, self );
279
+
280
+ count = sentence_num_valid_linkages( (Sentence)ptr->sentence );
281
+ rary = rb_ary_new2( count );
282
+
283
+ for ( i = 0; i < count; i++ ) {
284
+ VALUE linkage;
285
+ VALUE args[2];
286
+
287
+ args[0] = INT2FIX( i );
288
+ args[1] = self;
289
+
290
+ linkage = rb_class_new_instance( 2, args, rlink_cLinkage );
291
+ rb_ary_store( rary, i, linkage );
292
+ }
293
+
294
+ return rary;
295
+ }
296
+
297
+
298
+ /*
299
+ * call-seq:
300
+ * sentence.length -> fixnum
301
+ *
302
+ * Returns the number of words in the tokenized sentence, including the
303
+ * boundary words and punctuation.
304
+ *
305
+ */
306
+
307
+ static VALUE
308
+ rlink_sentence_length( VALUE self ) {
309
+ rlink_SENTENCE *ptr = get_sentence( self );
310
+ return INT2FIX( sentence_length((Sentence)ptr->sentence) );
311
+ }
312
+
313
+
314
+ /*
315
+ * call-seq:
316
+ * sentence.word( idx ) -> str
317
+ *
318
+ * Returns the spelling of the n-th word in the sentence as it appears after
319
+ * tokenization.
320
+ */
321
+ static VALUE
322
+ rlink_sentence_word( VALUE self, VALUE n ) {
323
+ rlink_SENTENCE *ptr = get_sentence( self );
324
+ char *word;
325
+
326
+ word = sentence_get_word( (Sentence)ptr->sentence, FIX2INT(n) );
327
+ return rb_str_new2( word );
328
+ }
329
+
330
+
331
+ /*
332
+ * call-seq:
333
+ * sentence.words -> array
334
+ *
335
+ * Returns the words of the sentence as they appear after tokenization.
336
+ *
337
+ * sentence = LinkParser::Dictionary.new.parse( "The dogs barks." )
338
+ * sentence.words #->
339
+ */
340
+ static VALUE
341
+ rlink_sentence_words( VALUE self ) {
342
+ rlink_SENTENCE *ptr = get_sentence( self );
343
+ char *word;
344
+ int i, length;
345
+ VALUE words = rb_ary_new();
346
+
347
+ length = sentence_length( (Sentence)ptr->sentence );
348
+ for ( i = 0; i < length; i++ ) {
349
+ word = sentence_get_word( (Sentence)ptr->sentence, i );
350
+ rb_ary_push( words, rb_str_new2(word) );
351
+ }
352
+
353
+ return words;
354
+ }
355
+
356
+
357
+ /*
358
+ * call-seq:
359
+ * sentence[index] -> str
360
+ * sentence[start, length] -> str
361
+ * sentence[range] -> str
362
+ *
363
+ * Element Reference---Returns the element at index, or returns a subarray
364
+ * starting at start and continuing for length elements, or returns a subarray
365
+ * specified by range. Negative indices count backward from the end of the
366
+ * array (-1 is the last element). Returns nil if the index (or starting
367
+ * index) are out of range.
368
+ *
369
+ * sent = dict.parse( "Birds fly south for the winter." )
370
+ *
371
+ * sent[1] # => "birds"
372
+ * sent[0,4] # => ["LEFT-WALL", "birds", "fly", "south"]
373
+ * sent[1..3] # => ["birds", "fly", "south"]
374
+ *
375
+ */
376
+ static VALUE
377
+ rlink_sentence_aref( argc, argv, self )
378
+ int argc;
379
+ VALUE *argv;
380
+ VALUE self;
381
+ {
382
+ VALUE words = rlink_sentence_words( self );
383
+ return rb_funcall2( words, rb_intern("[]"), argc, argv );
384
+ }
385
+
386
+
387
+ /*
388
+ * call-seq:
389
+ * sentence.null_count -> int
390
+ *
391
+ * Returns the number of null links that were used in parsing the sentence.
392
+ */
393
+ static VALUE
394
+ rlink_sentence_null_count( VALUE self ) {
395
+ rlink_SENTENCE *ptr = get_sentence( self );
396
+ int count;
397
+
398
+ count = sentence_null_count( (Sentence)ptr->sentence );
399
+ return INT2FIX( count );
400
+ }
401
+
402
+
403
+ /*
404
+ * call-seq:
405
+ * sentence.num_linkages_found -> fixnum
406
+ *
407
+ * Returns the number of linkages found when parsing the sentence. This will
408
+ * cause the sentence to be parsed if it hasn't been already.
409
+ */
410
+ static VALUE
411
+ rlink_sentence_num_linkages_found( VALUE self ) {
412
+ rlink_SENTENCE *ptr = get_sentence( self );
413
+ int i = 0;
414
+
415
+ if ( !RTEST(ptr->parsed_p) )
416
+ rlink_sentence_parse( 0, 0, self );
417
+ i = sentence_num_linkages_found( (Sentence)ptr->sentence );
418
+
419
+ return INT2FIX( i );
420
+ }
421
+
422
+
423
+ /*
424
+ * call-seq:
425
+ * sentence.num_valid_linkages -> fixnum
426
+ *
427
+ * Return the number of linkages that had no post-processing violations.
428
+ */
429
+ static VALUE
430
+ rlink_sentence_num_valid_linkages( VALUE self ) {
431
+ rlink_SENTENCE *ptr = get_sentence( self );
432
+ int count;
433
+
434
+ count = sentence_num_valid_linkages( (Sentence)ptr->sentence );
435
+ return INT2FIX( count );
436
+ }
437
+
438
+
439
+ /*
440
+ * call-seq:
441
+ * sentence.num_linkages_post_processed -> fixnum
442
+ *
443
+ * Return the number of linkages that were actually post-processed (which may
444
+ * be less than the number found because of the linkage_limit parameter).
445
+ */
446
+ static VALUE
447
+ rlink_sentence_num_linkages_post_processed( VALUE self ) {
448
+ rlink_SENTENCE *ptr = get_sentence( self );
449
+ int count;
450
+
451
+ count = sentence_num_linkages_post_processed( (Sentence)ptr->sentence );
452
+ return INT2FIX( count );
453
+ }
454
+
455
+
456
+ /*
457
+ * call-seq:
458
+ * sentence.num_violations( i ) -> fixnum
459
+ *
460
+ * The number of post-processing violations that the i-th linkage had during
461
+ * the last parse.
462
+ */
463
+ static VALUE
464
+ rlink_sentence_num_violations( VALUE self, VALUE i ) {
465
+ rlink_SENTENCE *ptr = get_sentence( self );
466
+ int count;
467
+
468
+ count = sentence_num_violations( (Sentence)ptr->sentence, FIX2INT(i) );
469
+ return INT2FIX( count );
470
+ }
471
+
472
+
473
+ /*
474
+ * call-seq:
475
+ * sentence.disjunct_cost( i ) -> fixnum
476
+ *
477
+ * The maximum cost of connectors used in the i-th linkage of the sentence.
478
+ */
479
+ static VALUE
480
+ rlink_sentence_disjunct_cost( VALUE self, VALUE i ) {
481
+ rlink_SENTENCE *ptr = get_sentence( self );
482
+ int count;
483
+
484
+ count = sentence_disjunct_cost( (Sentence)ptr->sentence, FIX2INT(i) );
485
+ return INT2FIX( count );
486
+ }
487
+
488
+
489
+ /*
490
+ * Document-class: LinkParser::Sentence
491
+ *
492
+ * A Sentence is the API's representation of an input string,
493
+ * tokenized and interpreted according to a specific Dictionary. After
494
+ * a Sentence is created and parsed, various attributes of the
495
+ * resulting set of linkages can be obtained.
496
+ *
497
+ */
498
+ void
499
+ rlink_init_sentence() {
500
+ rlink_cSentence = rb_define_class_under( rlink_mLinkParser, "Sentence",
501
+ rb_cObject );
502
+
503
+ rb_define_alloc_func( rlink_cSentence, rlink_sentence_s_alloc );
504
+
505
+ rb_define_method( rlink_cSentence, "initialize", rlink_sentence_init, 2 );
506
+ rb_define_method( rlink_cSentence, "parse", rlink_sentence_parse, -1 );
507
+ rb_define_method( rlink_cSentence, "parsed?", rlink_sentence_parsed_p, 0 );
508
+ rb_define_method( rlink_cSentence, "linkages", rlink_sentence_linkages, 0 );
509
+
510
+ rb_define_method( rlink_cSentence, "options", rlink_sentence_options, 0 );
511
+
512
+ rb_define_method( rlink_cSentence, "length", rlink_sentence_length, 0 );
513
+ rb_define_method( rlink_cSentence, "word", rlink_sentence_word, 1 );
514
+ rb_define_method( rlink_cSentence, "words", rlink_sentence_words, 0 );
515
+ rb_define_method( rlink_cSentence, "[]", rlink_sentence_aref, -1 );
516
+
517
+ rb_define_method( rlink_cSentence, "null_count",
518
+ rlink_sentence_null_count, 0 );
519
+ rb_define_method( rlink_cSentence, "num_linkages_found",
520
+ rlink_sentence_num_linkages_found, 0 );
521
+ rb_define_method( rlink_cSentence, "num_valid_linkages",
522
+ rlink_sentence_num_valid_linkages, 0 );
523
+ rb_define_method( rlink_cSentence, "num_linkages_post_processed",
524
+ rlink_sentence_num_linkages_post_processed, 0 );
525
+ rb_define_method( rlink_cSentence, "num_violations",
526
+ rlink_sentence_num_violations, 1 );
527
+ rb_define_method( rlink_cSentence, "disjunct_cost",
528
+ rlink_sentence_disjunct_cost, 1 );
529
+
530
+ /*
531
+ link_public_api(char *) sentence_get_nth_word(Sentence sent, int i);
532
+ link_public_api(int) sentence_nth_word_has_disjunction(Sentence sent, int i);
533
+ */
534
+
535
+ }
536
+