gr_string_escape 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,22 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ *.bundle
18
+ coverage
19
+ rdoc
20
+ pkg
21
+ tmp
22
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Curtis Schofield
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,17 @@
1
+ = gr_string_escape
2
+
3
+ gr_string_escape is a Ruby C extension that parses Goodreads-style strings and escapes them into HTML.
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, do not mess with rakefile, version, or history.
12
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Curtis Schofield. See LICENSE for details.
@@ -0,0 +1,61 @@
1
# Build, test and documentation tasks for the gr_string_escape gem.
#
# Jeweler (packaging), rake-compiler (native build) and rcov (coverage) are
# optional: when one of them is not installed the matching tasks are skipped
# or replaced by a task that explains what to install, instead of the whole
# Rakefile failing to load.
require 'rubygems'
require 'rake'
require 'rake/clean' # defines CLEAN, which is used below even without rake-compiler

# --- gem packaging -----------------------------------------------------------
# jeweler_tasks stays nil when Jeweler is not installed.
jeweler_tasks = nil
begin
  require 'jeweler'
  jeweler_tasks = Jeweler::Tasks.new do |gem|
    gem.name = "gr_string_escape"
    gem.summary = %Q{Goodreads string parser}
    gem.description = %Q{Code for Goodreads String Parsing}
    gem.email = "github.com@robotarmyma.de"
    gem.homepage = "http://github.com/robotarmy/gr_string_escape"
    gem.authors = ["Michael Economy","Curtis Schofield"]
    gem.extensions = FileList['ext/**/extconf.rb']
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler missing : \n gem install jeweler"
end

# --- native extension --------------------------------------------------------
extension_task_loaded = true
begin
  require 'rake/extensiontask'
rescue LoadError
  extension_task_loaded = false
  puts "rake-compiler missing : \n gem install rake-compiler"
end

if extension_task_loaded
  # The gemspec is optional for ExtensionTask; pass it only when Jeweler built one.
  Rake::ExtensionTask.new('gr_string_escape', jeweler_tasks && jeweler_tasks.gemspec)
else
  # Keep :compile defined so that :default fails with a helpful message.
  task :compile do
    abort "rake-compiler is not available. In order to compile, you must: gem install rake-compiler"
  end
end
CLEAN.include 'lib/**/*.so'

# --- tests -------------------------------------------------------------------
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is provided by Jeweler; only depend on it when it exists.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => [:compile,:test]

# --- documentation -----------------------------------------------------------
# rake/rdoctask was removed from rake; prefer rdoc/task and fall back for old setups.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip the trailing newline of the VERSION file so the title stays on one line
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "gr_string_escape #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.3.1
@@ -0,0 +1,4 @@
1
# Generates the Makefile that builds the gr_string_escape native extension.
require 'mkmf'

extension_name = "gr_string_escape"

# Registers the --with-gr_string_escape-dir style options, then writes the Makefile.
dir_config(extension_name)
create_makefile(extension_name)
@@ -0,0 +1,1162 @@
1
+ #include "ruby.h"
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+
6
+
7
+ #define MAX_STACK_DEPTH 20
8
+ #define MAX_URL_PRINT_SIZE 40
9
+ #define MAX_ATTRIBUTES 10
10
+ #define DYNAMICS_INCREMENT 128
11
+ #define STRONG "strong"
12
+ #define EM "em"
13
+ #define FALSE 0
14
+ #define TRUE 1
15
+ #define EMPTY_STRING ""
16
+
17
+ static int id_push; // Ruby method ID for "push"; assigned in Init_gr_string_escape, no other use is visible in this file
18
+
19
+ int input_size, output_size, max_output_size, absolute_url_size; /* byte counts: input being parsed, output written so far, output buffer capacity, absolute_url */
20
+ int position; /* index of the next unread input byte; advanced by t_parse */
21
+ int ahead_position; /* look-ahead cursor used by the *_read helpers; t_parse copies it into position when a helper succeeds */
22
+ char **tag_stack; /* names of the HTML tags currently open, innermost last */
23
+ int tag_stack_size; /* number of entries in tag_stack */
24
+ int anchors_in_stack; /* count of open <a> tags; url_read refuses to auto-link while it is > 0 */
25
+ int cap_count; /* characters counted so far towards the cap passed to t_parse */
26
+ int writing_utf_8; //set to the size of the utf_8 char remaining (continuation bytes still expected by write_char)
27
+ int counting_cap; /* TRUE while written characters should be added to cap_count */
28
+
29
+ char* input; /* bytes of the string currently being parsed (set in t_parse) */
30
+ char* absolute_url; /* optional prefix for generated Goodreads links; set by t_set_absolute_url, not NUL-terminated there */
31
+ char* output; /* output buffer; writes are bounded by max_output_size */
32
+
33
/*
 * Lower-cases a NUL-terminated string in place.
 * Only the ASCII letters 'A'..'Z' are changed; all other bytes are kept.
 */
static void downcase(char *string) {
  char *cursor = string;

  while (*cursor != '\0') {
    if (*cursor >= 'A' && *cursor <= 'Z') {
      *cursor += 32; // distance between 'A' and 'a' in ASCII
    }
    cursor++;
  }
}
41
+
42
/*
 * Lower-cases the first string_size bytes of string in place.
 * Unlike downcase() it does not stop at a NUL byte; only ASCII 'A'..'Z'
 * are changed. Nothing happens when string_size is zero or negative.
 */
static void downcasen(char *string, int string_size) {
  char *cursor = string;
  int remaining = string_size;

  while (remaining-- > 0) {
    if (*cursor >= 'A' && *cursor <= 'Z') {
      *cursor += 32; // distance between 'A' and 'a' in ASCII
    }
    cursor++;
  }
}
51
+
52
+ static void write_chars(char *chars) {
53
+ int size = strlen(chars);
54
+ if(output_size + size >= max_output_size) {
55
+ printf("Error: max_output_size is being exceeded\n");
56
+ return;
57
+ }
58
+ strncpy(output + output_size, chars, size);
59
+ output_size += size;
60
+ if(counting_cap) {
61
+ cap_count += size;
62
+ }
63
+ }
64
+
65
+ inline push_to_tag_stack(char *string, int string_size) {
66
+ char *new_string;
67
+ new_string = ALLOC_N(char, string_size + 1);
68
+ strcpy(new_string, string);
69
+ tag_stack[tag_stack_size++] = new_string;
70
+ }
71
+
72
+ inline write_nchars(char *chars, int chars_size) {
73
+ if(output_size + chars_size >= max_output_size) {
74
+ printf("Error: max_output_size is being exceeded\n");
75
+ return;
76
+ }
77
+ strncpy(output + output_size, chars, chars_size);
78
+ output_size += chars_size;
79
+ if(counting_cap) {
80
+ cap_count += chars_size;
81
+ }
82
+ }
83
+
84
+ inline void write_char(char char_to_write) {
85
+ if(output_size + 1 >= max_output_size) {
86
+ printf("Error: max_output_size is being exceeded\n");
87
+ return;
88
+ }
89
+
90
+ output[output_size++] = char_to_write;
91
+ if (writing_utf_8) {
92
+ if(char_to_write & 0x80 && !(char_to_write & 0x40)) {
93
+ writing_utf_8 --;
94
+ }
95
+ else {
96
+ writing_utf_8 = 0;
97
+ }
98
+ }
99
+ else {
100
+ if(char_to_write & 0x80) {
101
+ if (char_to_write & 0x40) {
102
+ writing_utf_8 ++;
103
+ if (char_to_write & 0x20) {
104
+ writing_utf_8 ++;
105
+ if (char_to_write & 0x10) {
106
+ writing_utf_8 ++;
107
+ }
108
+ }
109
+ }
110
+ }
111
+ }
112
+ if(counting_cap && !writing_utf_8) {
113
+ cap_count++;
114
+ }
115
+ }
116
+
117
+
118
/*
 * Appends a NUL-terminated string to the output, replacing the characters
 * & > < " with their HTML entities. Every other byte is written unchanged.
 */
static void write_escaped_chars(char *chars) {
  char *cursor;

  for (cursor = chars; *cursor != '\0'; cursor++) {
    char current = *cursor;

    if (current == '&') {
      write_chars("&amp;");
    }
    else if (current == '>') {
      write_chars("&gt;");
    }
    else if (current == '<') {
      write_chars("&lt;");
    }
    else if (current == '"') {
      write_chars("&quot;");
    }
    else {
      write_char(current);
    }
  }
}
141
+
142
+
143
+
144
+
145
+ static void write_urlitized_chars(char *chars) {
146
+ int i = 0;
147
+ int wrote_underscore = FALSE;
148
+ char c;
149
+ for (i = 0; c = chars[i]; i++) {
150
+ if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
151
+ (c >= '0' && c <= '9')) {
152
+ write_char(c);
153
+ wrote_underscore = FALSE;
154
+ }
155
+ else {
156
+ if (!wrote_underscore) {
157
+ wrote_underscore = TRUE;
158
+ write_char('_');
159
+ }
160
+ }
161
+ }
162
+ }
163
+
164
/*
 * Decides whether a URL taken from an href/src attribute may be emitted.
 *
 * Returns 1 (TRUE) when the URL is acceptable and 0 (FALSE) when it is NULL,
 * four bytes or shorter, or starts with "java" in any letter case (which
 * rejects javascript: URLs).
 *
 * Browsers strip leading spaces and control characters, and remove tabs and
 * line breaks anywhere in a URL, before resolving it. The prefix check
 * therefore skips those bytes as well, so " javascript:..." and
 * "j\tavascript:..." are rejected too.
 */
static int valid_url(char * string){
  static const char blocked_prefix[] = "java";
  const unsigned char *cursor;
  size_t matched = 0;

  if (string == NULL || strlen(string) <= 4) {
    return 0; // FALSE
  }

  cursor = (const unsigned char *)string;

  // leading spaces and C0 control characters are ignored by URL parsers
  while (*cursor != '\0' && *cursor <= 0x20) {
    cursor++;
  }

  for (; *cursor != '\0' && blocked_prefix[matched] != '\0'; cursor++) {
    unsigned char current = *cursor;

    if (current == '\t' || current == '\n' || current == '\r') {
      continue; // removed by URL parsers wherever they appear
    }
    if (current >= 'A' && current <= 'Z') {
      current = (unsigned char)(current + ('a' - 'A'));
    }
    if (current != (unsigned char)blocked_prefix[matched]) {
      break;
    }
    matched++;
  }

  // valid unless the whole blocked prefix was matched
  return blocked_prefix[matched] != '\0';
}
186
+
187
+
188
/*
 * Looks up key in the first num_keys entries of keys (exact, case-sensitive
 * comparison). Returns the index of the first match, or -1 when there is none.
 */
static int attribute_find(char *key, char **keys, int num_keys) {
  int found = -1;
  int index = 0;

  while (found < 0 && index < num_keys) {
    if (strcmp(key, keys[index]) == 0) {
      found = index;
    }
    index++;
  }
  return found;
}
200
+
201
+ static int html_parse(int start, int finish) {
202
+ int close_tag = FALSE;
203
+ int tag_closed = FALSE;
204
+ char *tag;
205
+ char *name;
206
+ char c;
207
+ int i;
208
+ char *attribute_keys[MAX_ATTRIBUTES];
209
+ char *attribute_values[MAX_ATTRIBUTES];
210
+ int num_attributes = 0;
211
+ int tag_size;
212
+ int name_size = 0;
213
+
214
+ tag = input + start;
215
+ tag_size = finish - start;
216
+
217
+ // printf("tag start: %c tag_size:%d\n", tag[0], tag_size);
218
+ if(tag_size <= 0) {
219
+ return FALSE;
220
+ }
221
+
222
+ // read all the whitespace and first slash "< / "
223
+ for(; tag_size > 0; tag++, tag_size--) {
224
+ int done = FALSE;
225
+ switch(c = tag[0]) {
226
+ case '/':
227
+ if (close_tag){
228
+ return FALSE;
229
+ }
230
+ else {
231
+ close_tag = TRUE;
232
+ }
233
+ break;
234
+ case ' ':
235
+ case '\t':
236
+ case '\n':
237
+ // ignore
238
+ break;
239
+ default:
240
+ done = TRUE;
241
+ break;
242
+ }
243
+ if(done){
244
+ break;
245
+ }
246
+ }
247
+
248
+ // read all the whitespace and last slash
249
+ for(; tag_size > 0; tag_size--) {
250
+ int done = FALSE;
251
+ switch(c = tag[tag_size - 1]) {
252
+ case '/':
253
+ if (tag_closed){
254
+ return FALSE;
255
+ }
256
+ else {
257
+ tag_closed = TRUE;
258
+ }
259
+ break;
260
+ case ' ':
261
+ case '\t':
262
+ case '\n':
263
+ // ignore
264
+ break;
265
+ default:
266
+ done = TRUE;
267
+ break;
268
+ }
269
+ if(done){
270
+ break;
271
+ }
272
+ }
273
+
274
+ if(tag_size == 0) {
275
+ return FALSE;
276
+ }
277
+
278
+ //read name
279
+ for(name_size = 0; name_size < tag_size; name_size++) {
280
+ int done = FALSE;
281
+ switch(tag[name_size]) {
282
+ case ' ':
283
+ case '\t':
284
+ case '\n':
285
+ done = TRUE;
286
+ break;
287
+ }
288
+ if(done) {
289
+ break;
290
+ }
291
+ }
292
+ name = ALLOCA_N(char, name_size + 1);
293
+ strncpy(name, tag, name_size);
294
+ name[name_size] = 0;
295
+
296
+ // printf("name_size %d tag name: %s\n", name_size, name);
297
+
298
+ tag_size -= name_size;
299
+ tag += name_size;
300
+
301
+
302
+ // read attributes
303
+ for(; tag_size > 0; tag_size--, tag++) {
304
+ char *key = tag;
305
+ char *value = NULL;
306
+ int key_size = 0;
307
+ int value_size = 0; // would you like fries with that?
308
+ int skip_value = FALSE;
309
+
310
+ // read key
311
+ for(; key_size < tag_size; key_size++) {
312
+ int done = FALSE;
313
+ c = tag[key_size];
314
+ switch(c) {
315
+ case ' ':
316
+ case '\t':
317
+ case '\n':
318
+ if (key_size > 0) {
319
+ done = TRUE;
320
+ }
321
+ else {
322
+ //ignore preceding whitespace
323
+ key++;
324
+ key_size--;
325
+ tag++;
326
+ tag_size--;
327
+ }
328
+ break;
329
+ case '=':
330
+ if (key_size > 0) {
331
+ done = TRUE;
332
+ // do not advance, equals still needs to be found
333
+ }
334
+ else {
335
+ return FALSE;
336
+ }
337
+ break;
338
+ case '\'':
339
+ case '"':
340
+ return FALSE;
341
+ }
342
+ if(done) {
343
+ break;
344
+ }
345
+ } // got key
346
+ key = ALLOCA_N(char, key_size + 1);
347
+ strncpy(key, tag, key_size);
348
+ key[key_size] = 0;
349
+ tag += key_size;
350
+ tag_size -= key_size;
351
+
352
+ for(;tag_size > 0; tag++, tag_size--) {
353
+ int done = FALSE;
354
+ switch(tag[0]) {
355
+ case ' ':
356
+ case '\t':
357
+ case '\n':
358
+ //ignore whitespace
359
+ break;
360
+ case '=':
361
+ done = TRUE;
362
+ tag++;
363
+ tag_size--;
364
+ break;
365
+ default:
366
+ skip_value = TRUE;
367
+ done = TRUE;
368
+ break;
369
+ }
370
+ if(done) {
371
+ break;
372
+ }
373
+ }
374
+
375
+ if(!skip_value) {
376
+ // read value
377
+ int started_reading_value = FALSE;
378
+ for(; value_size < tag_size; value_size++) {
379
+ int done = FALSE;
380
+ c = tag[value_size];
381
+ switch(c) {
382
+ case ' ':
383
+ case '\t':
384
+ case '\n':
385
+ if (started_reading_value) {
386
+ done = TRUE;
387
+ }
388
+ else {
389
+ //ignore preceding whitespace
390
+ value_size--;
391
+ tag++;
392
+ tag_size--;
393
+ }
394
+ break;
395
+ case '=':
396
+ return FALSE;
397
+ case '\'':
398
+ case '"':
399
+ if(!value) {
400
+ started_reading_value = TRUE;
401
+ tag++;
402
+ tag_size --;
403
+ while(value_size <= tag_size && tag[value_size] != c) {
404
+ value_size++;
405
+ }
406
+ if(tag[value_size] != c){
407
+ return FALSE;
408
+ }
409
+ done = TRUE;
410
+ }
411
+ else {
412
+ return FALSE;
413
+ }
414
+ break;
415
+ default:
416
+ if (!started_reading_value) {
417
+ started_reading_value = TRUE;
418
+ }
419
+ break;
420
+ }
421
+ if(done) {
422
+ break;
423
+ }
424
+ }
425
+
426
+ if(started_reading_value) {
427
+ value = ALLOCA_N(char, value_size + 1);
428
+ strncpy(value, tag, value_size);
429
+ value[value_size] = 0;
430
+ tag += value_size;
431
+ tag_size -= value_size;
432
+ }
433
+ }
434
+
435
+ if (key_size > 0) {
436
+ if (num_attributes >= MAX_ATTRIBUTES) {
437
+ break;
438
+ }
439
+ attribute_keys[num_attributes] = key;
440
+ if (value) {
441
+ attribute_values[num_attributes] = value;
442
+ }
443
+ else {
444
+ attribute_values[num_attributes] = EMPTY_STRING;
445
+ }
446
+ num_attributes ++;
447
+ }
448
+ }
449
+
450
+ for(i = 0; i< num_attributes; i++) {
451
+ downcase(attribute_keys[i]);
452
+ }
453
+
454
+ //clean tag!
455
+ downcasen(name, name_size);
456
+ if(strcmp(name, "b") == 0){
457
+ name = STRONG;
458
+ }
459
+ else if(strcmp(name, "i") == 0) {
460
+ name = EM;
461
+ }
462
+
463
+ if(close_tag) {
464
+ if(tag_stack_size == 0){
465
+ return FALSE;
466
+ }
467
+ if(strcmp(tag_stack[tag_stack_size - 1], name) == 0){
468
+ if(strcmp(name, "a")) {
469
+ anchors_in_stack--;
470
+ }
471
+ write_chars("</");
472
+ write_chars(tag_stack[--tag_stack_size]);
473
+ write_char('>');
474
+ }
475
+ }
476
+ else { //not a close tag
477
+ if(tag_stack_size >= MAX_STACK_DEPTH) {
478
+ return FALSE;
479
+ }
480
+ if(strcmp(EM, name) == 0 || strcmp(STRONG, name) == 0 ||
481
+ strcmp("s", name) == 0 || strcmp("u", name) == 0 ||
482
+ strcmp("p", name) == 0 || strcmp("blockquote", name) == 0 ||
483
+ strcmp("pre", name) == 0){
484
+ if(tag_closed) {
485
+ return FALSE;
486
+ }
487
+ if(num_attributes != 0) {
488
+ return FALSE;
489
+ }
490
+ push_to_tag_stack(name, name_size);
491
+ write_char('<');
492
+ write_chars(name);
493
+ write_char('>');
494
+ }
495
+ else if(strcmp("br", name) == 0) {
496
+ if(num_attributes != 0) {
497
+ return FALSE;
498
+ }
499
+ write_chars("<br/>");
500
+ }
501
+ else if(strcmp("a", name) == 0) {
502
+ int href_pos;
503
+
504
+ // printf("trying A tag\n");
505
+ href_pos = attribute_find("href", attribute_keys,
506
+ num_attributes);
507
+ if(href_pos < 0) {
508
+ return FALSE;
509
+ }
510
+ if(!valid_url(attribute_values[href_pos])) {
511
+ return FALSE;
512
+ }
513
+
514
+ write_chars("<a rel=\"nofollow\" target=\"_blank\" href=\"");
515
+ write_chars(attribute_values[href_pos]);
516
+ write_chars("\">");
517
+
518
+ push_to_tag_stack("a", 1);
519
+ anchors_in_stack++;
520
+ }
521
+ else if(strcmp("img", name) == 0) {
522
+ int src_pos, alt_pos, width_pos, height_pos;
523
+
524
+ src_pos = attribute_find("src", attribute_keys,
525
+ num_attributes);
526
+ alt_pos = attribute_find("alt", attribute_keys,
527
+ num_attributes);
528
+ width_pos = attribute_find("width", attribute_keys,
529
+ num_attributes);
530
+ height_pos = attribute_find("height", attribute_keys,
531
+ num_attributes);
532
+ if(src_pos < 0) {
533
+ return FALSE;
534
+ }
535
+
536
+ if(!valid_url(attribute_values[src_pos])) {
537
+ return FALSE;
538
+ }
539
+
540
+ write_chars("<img src=\"");
541
+ write_chars(attribute_values[src_pos]);
542
+ if (alt_pos >= 0){
543
+ write_chars("\" alt=\"");
544
+ write_chars(attribute_values[alt_pos]);
545
+ }
546
+ if (width_pos >= 0){
547
+ write_chars("\" width=\"");
548
+ write_chars(attribute_values[width_pos]);
549
+ }
550
+ if (height_pos >= 0){
551
+ write_chars("\" height=\"");
552
+ write_chars(attribute_values[height_pos]);
553
+ }
554
+ write_chars("\" class=\"escapedImg\"/>");
555
+ }
556
+ else {
557
+ return FALSE;
558
+ }
559
+ }
560
+ return TRUE;
561
+ }
562
+
563
+ static int gr_tag_parse() {
564
+ char *tag;
565
+ char *attributes[MAX_ATTRIBUTES];
566
+ int num_attributes = 0;
567
+ char *name;
568
+ int tag_size;
569
+ int name_size;
570
+ int i;
571
+ char *id; // attribute[1]
572
+ char *title;// attribute[2], defaulted to attr[0]
573
+ tag = input + position;
574
+ tag_size = (ahead_position - position) - 1;
575
+
576
+ if(tag_size < 3) {
577
+ return FALSE;
578
+ }
579
+
580
+ //read name
581
+ for(name_size = 0; name_size < tag_size; name_size++) {
582
+ if(tag[name_size] == ':') {
583
+ break;
584
+ }
585
+ }
586
+ name = ALLOCA_N(char, name_size + 1);
587
+ strncpy(name, tag, name_size);
588
+ name[name_size] = 0;
589
+ downcasen(name, name_size);
590
+
591
+ if(position + name_size + 1 > input_size) {
592
+ printf("Error: position + name_size + 1 > input_size\n");
593
+ return;
594
+ }
595
+
596
+ tag += name_size + 1;
597
+ tag_size -= name_size + 1;
598
+ while(tag_size > 0) {
599
+ if (num_attributes == MAX_ATTRIBUTES) {
600
+ return FALSE;
601
+ }
602
+ int attr_size;
603
+ char *attr;
604
+ for(attr_size = 0; attr_size < tag_size; attr_size++) {
605
+ if(tag[attr_size] == '|') {
606
+ break;
607
+ }
608
+ }
609
+ attr = ALLOCA_N(char, attr_size + 1);
610
+ strncpy(attr, tag, attr_size);
611
+ attr[attr_size] = 0;
612
+ tag += attr_size + 1;
613
+ tag_size -= attr_size + 1;
614
+ attributes[num_attributes++] = attr;
615
+ }
616
+
617
+ if (num_attributes < 1) {
618
+ return FALSE;
619
+ }
620
+
621
+ if (num_attributes >= 2) {
622
+ id = attributes[1];
623
+ }
624
+ else {
625
+ id = 0;
626
+ }
627
+
628
+ if (num_attributes >= 3) {
629
+ title = attributes[2];
630
+ }
631
+ else {
632
+ title = attributes[0];
633
+ }
634
+
635
+ if(strcmp("b", name) == 0 || strcmp("book", name) == 0){
636
+ if (id) {
637
+ counting_cap = FALSE;
638
+ write_chars("<a href=\"");
639
+ if (absolute_url) {
640
+ write_nchars(absolute_url, absolute_url_size);
641
+ }
642
+ write_chars("/book/show/");
643
+ write_chars(id);
644
+ write_char('.');
645
+ write_urlitized_chars(title);
646
+ write_chars("\" title=\"");
647
+ write_escaped_chars(title);
648
+ if (num_attributes >= 4) {
649
+ write_chars(" by ");
650
+ write_escaped_chars(attributes[3]);
651
+ }
652
+ write_chars("\">");
653
+ counting_cap = TRUE;
654
+ write_escaped_chars(attributes[0]);
655
+ counting_cap = FALSE;
656
+ write_chars("</a>");
657
+ counting_cap = TRUE;
658
+ }
659
+ else {
660
+ counting_cap = FALSE;
661
+ write_chars("<a href=\"");
662
+ if (absolute_url) {
663
+ write_nchars(absolute_url, absolute_url_size);
664
+ }
665
+ write_chars("/search/search?q=");
666
+ write_escaped_chars(attributes[0]);
667
+ write_chars("\" title=\"");
668
+ write_chars(title);
669
+ write_chars("\">");
670
+ counting_cap = TRUE;
671
+ write_escaped_chars(attributes[0]);
672
+ counting_cap = FALSE;
673
+ write_chars("</a>");
674
+ counting_cap = TRUE;
675
+ }
676
+ }
677
+ else if(strcmp("bc", name) == 0){
678
+ if (num_attributes >= 5) {
679
+ counting_cap = FALSE;
680
+ write_chars("<a href=\"");
681
+ if (absolute_url) {
682
+ write_nchars(absolute_url, absolute_url_size);
683
+ }
684
+ write_chars("/book/show/");
685
+ write_chars(id);
686
+ write_char('.');
687
+ write_urlitized_chars(title);
688
+ write_chars("\"><img src=\"");
689
+ write_chars(attributes[4]);
690
+ write_chars("\" title=\"");
691
+ write_escaped_chars(title);
692
+ if (num_attributes >= 4) {
693
+ write_chars(" by ");
694
+ write_escaped_chars(attributes[3]);
695
+ }
696
+ write_chars("\" alt=\"");
697
+ write_escaped_chars(title);
698
+ write_chars("\"/></a>");
699
+ counting_cap = TRUE;
700
+ }
701
+ else {
702
+ write_chars("[bookcover:");
703
+ write_escaped_chars(attributes[0]);
704
+ write_char(']');
705
+ }
706
+ }
707
+ else if(strcmp("a", name) == 0 || strcmp("author", name) == 0){
708
+ if (id) {
709
+ counting_cap = FALSE;
710
+ write_chars("<a href=\"");
711
+ if (absolute_url) {
712
+ write_nchars(absolute_url, absolute_url_size);
713
+ }
714
+ write_chars("/author/show/");
715
+ write_chars(id);
716
+ write_char('.');
717
+ write_urlitized_chars(title);
718
+ write_chars("\" title=\"");
719
+ write_escaped_chars(title);
720
+ write_chars("\">");
721
+ counting_cap = TRUE;
722
+ write_escaped_chars(attributes[0]);
723
+ counting_cap = FALSE;
724
+ write_chars("</a>");
725
+ counting_cap = TRUE;
726
+ }
727
+ else {
728
+ counting_cap = FALSE;
729
+ write_chars("<a href=\"");
730
+ if (absolute_url) {
731
+ write_nchars(absolute_url, absolute_url_size);
732
+ }
733
+ write_chars("/search/search?q=");
734
+ write_escaped_chars(attributes[0]);
735
+ write_chars("\" title=\"");
736
+ write_chars(title);
737
+ write_chars("\">");
738
+ counting_cap = TRUE;
739
+ write_escaped_chars(attributes[0]);
740
+ counting_cap = FALSE;
741
+ write_chars("</a>");
742
+ counting_cap = TRUE;
743
+ }
744
+ }
745
+ else if(strcmp("ai", name) == 0){
746
+ if (num_attributes >= 4) {
747
+ counting_cap = FALSE;
748
+ write_chars("<a href=\"");
749
+ if (absolute_url) {
750
+ write_nchars(absolute_url, absolute_url_size);
751
+ }
752
+ write_chars("/author/show/");
753
+ write_chars(id);
754
+ write_char('.');
755
+ write_urlitized_chars(title);
756
+ write_chars("\"><img src=\"");
757
+ write_chars(attributes[3]);
758
+ write_chars("\" title=\"");
759
+ write_escaped_chars(title);
760
+ write_chars("\" alt=\"");
761
+ write_escaped_chars(title);
762
+ write_chars("\"/></a>");
763
+ counting_cap = TRUE;
764
+ }
765
+ else {
766
+ write_chars("[authorimage:");
767
+ write_escaped_chars(attributes[0]);
768
+ write_char(']');
769
+ }
770
+ }
771
+ else {
772
+ return FALSE;
773
+ }
774
+ return TRUE;
775
+ }
776
+
777
+ static int html_read() {
778
+ ahead_position = position;
779
+
780
+ while(ahead_position < input_size) {
781
+ switch(input[ahead_position++]) {
782
+ case '<':
783
+ return FALSE;
784
+ case '>':
785
+ return html_parse(position, ahead_position - 1);
786
+ }
787
+ }
788
+ return FALSE;
789
+ }
790
+
791
+ static int gr_tag_read() {
792
+ ahead_position = position;
793
+
794
+ // reading name
795
+ while(ahead_position < input_size) {
796
+ switch(input[ahead_position++]) {
797
+ case '[':
798
+ return FALSE;
799
+ case ']':
800
+ return gr_tag_parse();
801
+ }
802
+ }
803
+
804
+ return FALSE;
805
+ }
806
+
807
+ static int url_read() {
808
+ char *url;
809
+ char *url_downcase;
810
+ char c;
811
+ int i;
812
+ int url_size;
813
+ int has_http = FALSE;
814
+
815
+
816
+ if(anchors_in_stack > 0) {
817
+ return FALSE;
818
+ }
819
+
820
+ ahead_position = position;
821
+ url_size = 0;
822
+
823
+ while(ahead_position < input_size) {
824
+ int done = FALSE;
825
+ switch(input[ahead_position++]) {
826
+ case '"':
827
+ case '<':
828
+ case '>':
829
+ return FALSE;
830
+ case ' ':
831
+ case '\n':
832
+ case '\t':
833
+ case '(':
834
+ case ')':
835
+ ahead_position--;
836
+ done = TRUE;
837
+ break;
838
+ }
839
+ if(done) {
840
+ break;
841
+ }
842
+ }
843
+ url_size = 1 + ahead_position - position;
844
+
845
+ if (url_size < 5) {
846
+ return FALSE;
847
+ }
848
+
849
+ url = ALLOCA_N(char, url_size + 1);
850
+ strncpy(url, input + position - 1, url_size);
851
+ url[url_size] = 0;
852
+
853
+
854
+ url_downcase = ALLOCA_N(char, url_size + 1);
855
+ strncpy(url_downcase, url, url_size);
856
+ downcasen(url_downcase, url_size);
857
+
858
+
859
+ if (strncmp(url_downcase, "http://", 7) == 0) {
860
+ has_http = TRUE;
861
+ }
862
+ else if (strncmp(url_downcase, "https://", 8) == 0) {
863
+ has_http = TRUE;
864
+ }
865
+ else {
866
+ //try and decide if the its a url without 'http' in front
867
+ int has_www = FALSE;
868
+ int last_dot = -1;
869
+ int done = TRUE;
870
+ int tld_size;
871
+
872
+ //does it start with www.?
873
+ if (strncmp(url_downcase, "www.", 4) == 0) {
874
+ has_www = TRUE;
875
+ i = 4;
876
+ last_dot = 3;
877
+ }
878
+ else {
879
+ i = 0;
880
+ }
881
+
882
+ // see if it starts with a properly formed domain name
883
+ for(; i < url_size; i++) {
884
+ c = url_downcase[i];
885
+ if (c == '.') {
886
+ //starting with a period is invalid
887
+ if(i == 0) {
888
+ return FALSE;
889
+ }
890
+
891
+ //two periods in a row is invalid!
892
+ if(last_dot + 1 == i){
893
+ return FALSE;
894
+ }
895
+ last_dot = i;
896
+ }
897
+ else if (c == '/') {
898
+ // a slash means we're no longer reading a domain name
899
+ break;
900
+ }
901
+ else if ((c >= 'a' && c <= 'z') || c == '-' ||
902
+ (c >= '0' && c <= '9')) {
903
+ // valid domain name characters
904
+ }
905
+ else {
906
+ // domains must be made up of those other characters
907
+ return FALSE;
908
+ }
909
+ }
910
+
911
+ if(last_dot == -1) { // no periods were found
912
+ return FALSE;
913
+ }
914
+ tld_size = (i - last_dot) - 1;
915
+ if (has_www) {
916
+ if (tld_size < 2){
917
+ return FALSE;
918
+ }
919
+ }
920
+ else {
921
+ char *tld;
922
+ if (tld_size != 3){
923
+ return FALSE;
924
+ }
925
+ tld = url + last_dot + 1;
926
+ if(strncmp(tld, "com", 3) != 0 &&
927
+ strncmp(tld, "net", 3) != 0 &&
928
+ strncmp(tld, "org", 3) != 0 &&
929
+ strncmp(tld, "gov", 3) != 0){
930
+ // not a tld we autogenerate for!
931
+ return FALSE;
932
+ }
933
+ }
934
+
935
+ }
936
+
937
+ //OK, now its probably ok to generate the url
938
+ counting_cap = FALSE;
939
+ write_chars("<a rel=\"nofollow\" target=\"_blank\" href=\"");
940
+ if (!has_http) {
941
+ write_chars("http://");
942
+ }
943
+ write_chars(url);
944
+ if (url_size > MAX_URL_PRINT_SIZE) {
945
+ write_chars("\" title=\"");
946
+ write_chars(url);
947
+ }
948
+ write_chars("\">");
949
+ counting_cap = TRUE;
950
+ for(i = 0; i < MAX_URL_PRINT_SIZE && i < url_size; i++) {
951
+ switch(c = url[i]) {
952
+ case '&':
953
+ write_chars("&amp;");
954
+ break;
955
+ default:
956
+ write_char(c);
957
+ break;
958
+ }
959
+ }
960
+ if(i < url_size) {
961
+ write_chars("...");
962
+ }
963
+ counting_cap = FALSE;
964
+ write_chars("</a>");
965
+ counting_cap = TRUE;
966
+ return TRUE;
967
+ }
968
+
969
+
970
+ static int amp_read() {
971
+ int amp_escape_count = 0;
972
+ int poundsign = FALSE;
973
+ ahead_position = position;
974
+
975
+ // reading name
976
+ while(ahead_position < input_size) {
977
+ char c;
978
+ c = input[ahead_position++];
979
+ if (c == '#') {
980
+ if(amp_escape_count == 0) {
981
+ poundsign = TRUE;
982
+ }
983
+ else {
984
+ return FALSE;
985
+ }
986
+ }
987
+ else if(c >= '0' && c <= '9') {
988
+ }
989
+ else if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
990
+ if (poundsign) {
991
+ return FALSE;
992
+ }
993
+ }
994
+ else if (c == ';') {
995
+ if (amp_escape_count >= 2){
996
+ write_nchars(input + position - 1, amp_escape_count + 2);
997
+ return TRUE;
998
+ }
999
+ else {
1000
+ return FALSE;
1001
+ }
1002
+ }
1003
+ else {
1004
+ return FALSE;
1005
+ }
1006
+
1007
+ if(amp_escape_count++ > 6) {
1008
+ return FALSE;
1009
+ }
1010
+ }
1011
+
1012
+ return FALSE;
1013
+ }
1014
+
1015
+
1016
+ static VALUE t_parse(VALUE self, VALUE r_string, VALUE r_cap, VALUE r_cap_string) {
1017
+ char c;
1018
+ int try_url = TRUE;
1019
+ int cap_at;
1020
+ char * cap_string;
1021
+ VALUE result;
1022
+ tag_stack_size = 0;
1023
+ anchors_in_stack = 0;
1024
+ counting_cap = TRUE;
1025
+ cap_count = 0;
1026
+ writing_utf_8 = 0;
1027
+ position = ahead_position = 0;
1028
+
1029
+ //de-ruby all the inputs!
1030
+ r_string = StringValue(r_string);
1031
+ input_size = RSTRING(r_string)->len;
1032
+ input = RSTRING(r_string)->ptr;
1033
+
1034
+ if (NIL_P(r_cap)) {
1035
+ cap_at = 0;
1036
+ }
1037
+ else {
1038
+ cap_at = NUM2INT(r_cap);
1039
+ }
1040
+
1041
+ cap_string = StringValuePtr(r_cap_string);
1042
+
1043
+ max_output_size = input_size * 10 + 128;
1044
+ output = ALLOCA_N(char, max_output_size);
1045
+ output_size = 0;
1046
+
1047
+ tag_stack = ALLOCA_N(char *, MAX_STACK_DEPTH);
1048
+
1049
+ while(position < input_size) {
1050
+
1051
+ if(cap_at && cap_count >= cap_at) {
1052
+ write_chars(cap_string);
1053
+ break;
1054
+ }
1055
+ switch(c = input[position++]) {
1056
+ case '&':
1057
+ counting_cap = FALSE;
1058
+ if(amp_read()) {
1059
+ position = ahead_position;
1060
+ }
1061
+ else {
1062
+ write_chars("&amp;");
1063
+ }
1064
+ counting_cap = TRUE;
1065
+ try_url = FALSE;
1066
+ cap_count++;
1067
+ break;
1068
+ case '>':
1069
+ try_url = FALSE;
1070
+ write_chars("&gt;");
1071
+ break;
1072
+ case '<':
1073
+ counting_cap = FALSE;
1074
+ if(html_read()) {
1075
+ position = ahead_position;
1076
+ try_url = TRUE;
1077
+ counting_cap = TRUE;
1078
+ }
1079
+ else {
1080
+ counting_cap = TRUE;
1081
+ write_chars("&lt;");
1082
+ try_url = FALSE;
1083
+ }
1084
+ break;
1085
+ case '[':
1086
+ counting_cap = FALSE;
1087
+ if(gr_tag_read()) {
1088
+ position = ahead_position;
1089
+ try_url = TRUE;
1090
+ counting_cap = TRUE;
1091
+ }
1092
+ else {
1093
+ try_url = FALSE;
1094
+ counting_cap = TRUE;
1095
+ write_char('[');
1096
+ }
1097
+ break;
1098
+ case '"':
1099
+ try_url = FALSE;
1100
+ write_chars("&quot;");
1101
+ break;
1102
+ case '\n':
1103
+ write_chars("<br/>");
1104
+ try_url = TRUE;
1105
+ break;
1106
+ case ' ':
1107
+ case '\t':
1108
+ case '(':
1109
+ case ')':
1110
+ write_char(c);
1111
+ try_url = TRUE;
1112
+ break;
1113
+ default:
1114
+ if(try_url){
1115
+ if(url_read()) {
1116
+ position = ahead_position;
1117
+ }
1118
+ else {
1119
+ write_char(c);
1120
+ }
1121
+ try_url = FALSE;
1122
+ }
1123
+ else {
1124
+ write_char(c);
1125
+ }
1126
+ break;
1127
+ }
1128
+ }
1129
+ while(tag_stack_size > 0) {
1130
+ char *item = tag_stack[--tag_stack_size];
1131
+ write_chars("</");
1132
+ write_chars(item);
1133
+ free(item);
1134
+ write_char('>');
1135
+ }
1136
+ return rb_str_new(output, output_size);
1137
+ }
1138
+
1139
+
1140
+ static VALUE t_set_absolute_url(VALUE self, VALUE r_string) {
1141
+ int new_size;
1142
+ char *new_url;
1143
+ r_string = StringValue(r_string);
1144
+ absolute_url_size = RSTRING(r_string)->len;
1145
+ new_url = RSTRING(r_string)->ptr;
1146
+ if (absolute_url) {
1147
+ free(absolute_url);
1148
+ }
1149
+ absolute_url = malloc(absolute_url_size);
1150
+ strncpy(absolute_url, new_url, absolute_url_size);
1151
+ return r_string;
1152
+ }
1153
+
1154
+
1155
+ VALUE cTest;
1156
+
1157
+ void Init_gr_string_escape() {
1158
+ cTest = rb_define_class("GrStringEscape", rb_cObject);
1159
+ rb_define_method(cTest, "parse", t_parse, 3);
1160
+ rb_define_method(cTest, "set_absolute_url", t_set_absolute_url, 1);
1161
+ id_push = rb_intern("push");
1162
+ }