gr_string_escape 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,22 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ *.bundle
18
+ coverage
19
+ rdoc
20
+ pkg
21
+ tmp
22
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Curtis Schofield
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,17 @@
1
+ = gr_string_escape
2
+
3
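+ A Ruby C extension (the Goodreads string parser) that turns user-supplied text into safe HTML: it keeps a small whitelist of HTML tags, expands Goodreads tags such as [book:...] and [author:...] into links, auto-links bare URLs, and escapes everything else.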
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, do not mess with rakefile, version, or history.
12
+ (If you want to have your own version, that is fine, but bump the version in a commit by itself so that I can ignore it when I pull.)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Curtis Schofield. See LICENSE for details.
@@ -0,0 +1,61 @@
1
# Rakefile for the gr_string_escape gem.
#
# Defines the packaging tasks (jeweler), the native extension build
# (rake-compiler), the test and coverage tasks, and rdoc generation.
# jeweler, rake-compiler and rcov are optional: when one of them is missing
# a hint is printed and the remaining tasks stay usable.
require 'rubygems'
require 'rake'
require 'rake/clean' # provides CLEAN even when rake-compiler is not installed

jeweler_tasks = nil
begin
  require 'jeweler'
  jeweler_tasks = Jeweler::Tasks.new do |gem|
    gem.name = "gr_string_escape"
    gem.summary = %Q{Goodreads string parser}
    gem.description = %Q{Code for Goodreads String Parsing}
    gem.email = "github.com@robotarmyma.de"
    gem.homepage = "http://github.com/robotarmy/gr_string_escape"
    gem.authors = ["Michael Economy","Curtis Schofield"]
    gem.extensions = FileList['ext/**/extconf.rb']
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler missing : \n gem install jeweler"
end

begin
  require 'rake/extensiontask'
  # The extension task is created inside the begin block so that a missing
  # rake-compiler cannot raise NameError, and the gemspec is only passed
  # when jeweler actually produced one (jeweler_tasks is nil otherwise).
  if jeweler_tasks
    Rake::ExtensionTask.new('gr_string_escape', jeweler_tasks.gemspec)
  else
    Rake::ExtensionTask.new('gr_string_escape')
  end
rescue LoadError
  puts "rake-compiler missing : \n gem install rake-compiler"
end
CLEAN.include 'lib/**/*.so'


require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is defined by jeweler; only depend on it when it exists.
task :test => :check_dependencies if jeweler_tasks

task :default => [:compile,:test]

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "gr_string_escape #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.3.1
@@ -0,0 +1,4 @@
1
# Generates the Makefile that builds the gr_string_escape native extension.
require 'mkmf'

extension_name = 'gr_string_escape'

# Registers the --with-gr_string_escape-{dir,include,lib} configure options.
dir_config(extension_name)
create_makefile(extension_name)
@@ -0,0 +1,1162 @@
1
+ #include "ruby.h"
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+
6
+
7
+ #define MAX_STACK_DEPTH 20
8
+ #define MAX_URL_PRINT_SIZE 40
9
+ #define MAX_ATTRIBUTES 10
10
+ #define DYNAMICS_INCREMENT 128
11
+ #define STRONG "strong"
12
+ #define EM "em"
13
+ #define FALSE 0
14
+ #define TRUE 1
15
+ #define EMPTY_STRING ""
16
+
17
+ static int id_push; // Ruby method ID for "push"; interned in Init_gr_string_escape
18
+
19
+ int input_size, output_size, max_output_size, absolute_url_size; // byte length of input / bytes written to output / capacity of output / byte length of absolute_url
20
+ int position; // index of the next input byte the main loop in t_parse will consume
21
+ int ahead_position; // lookahead cursor used by the *_read helpers; t_parse copies it into position when a helper succeeds
22
+ char **tag_stack; // names of the HTML tags that are currently open, innermost last
23
+ int tag_stack_size; // number of entries in tag_stack
24
+ int anchors_in_stack; // counter of open <a> tags; url_read refuses to auto-link while it is > 0
25
+ int cap_count; // running count that t_parse compares against the requested cap
26
+ int writing_utf_8; // number of UTF-8 continuation bytes still expected for the character being written by write_char
27
+ int counting_cap; // when non-zero, the write helpers add what they write to cap_count
28
+
29
+ char* input; // bytes of the Ruby string being parsed (set in t_parse)
30
+ char* absolute_url; // prefix for generated Goodreads links; set by t_set_absolute_url
31
+ char* output; // output buffer of max_output_size bytes (set in t_parse)
32
+
33
/*
 * Lowercases the ASCII letters 'A'..'Z' of a NUL-terminated string in place.
 * Every other byte is left untouched.
 */
static void downcase(char *string) {
    char *cursor = string;

    while (*cursor) {
        if (*cursor >= 'A' && *cursor <= 'Z') {
            *cursor += 32; /* distance from 'A' to 'a' in ASCII */
        }
        cursor++;
    }
}
41
+
42
/*
 * Lowercases the ASCII letters 'A'..'Z' in the first string_size bytes of
 * string, in place. The scan is length-driven and does not stop at NUL.
 */
static void downcasen(char *string, int string_size) {
    char *cursor = string;
    int remaining;

    for (remaining = string_size; remaining > 0; remaining--, cursor++) {
        if (*cursor >= 'A' && *cursor <= 'Z') {
            *cursor += 32; /* distance from 'A' to 'a' in ASCII */
        }
    }
}
51
+
52
+ static void write_chars(char *chars) {
53
+ int size = strlen(chars);
54
+ if(output_size + size >= max_output_size) {
55
+ printf("Error: max_output_size is being exceeded\n");
56
+ return;
57
+ }
58
+ strncpy(output + output_size, chars, size);
59
+ output_size += size;
60
+ if(counting_cap) {
61
+ cap_count += size;
62
+ }
63
+ }
64
+
65
+ inline push_to_tag_stack(char *string, int string_size) {
66
+ char *new_string;
67
+ new_string = ALLOC_N(char, string_size + 1);
68
+ strcpy(new_string, string);
69
+ tag_stack[tag_stack_size++] = new_string;
70
+ }
71
+
72
+ inline write_nchars(char *chars, int chars_size) {
73
+ if(output_size + chars_size >= max_output_size) {
74
+ printf("Error: max_output_size is being exceeded\n");
75
+ return;
76
+ }
77
+ strncpy(output + output_size, chars, chars_size);
78
+ output_size += chars_size;
79
+ if(counting_cap) {
80
+ cap_count += chars_size;
81
+ }
82
+ }
83
+
84
+ inline void write_char(char char_to_write) {
85
+ if(output_size + 1 >= max_output_size) {
86
+ printf("Error: max_output_size is being exceeded\n");
87
+ return;
88
+ }
89
+
90
+ output[output_size++] = char_to_write;
91
+ if (writing_utf_8) {
92
+ if(char_to_write & 0x80 && !(char_to_write & 0x40)) {
93
+ writing_utf_8 --;
94
+ }
95
+ else {
96
+ writing_utf_8 = 0;
97
+ }
98
+ }
99
+ else {
100
+ if(char_to_write & 0x80) {
101
+ if (char_to_write & 0x40) {
102
+ writing_utf_8 ++;
103
+ if (char_to_write & 0x20) {
104
+ writing_utf_8 ++;
105
+ if (char_to_write & 0x10) {
106
+ writing_utf_8 ++;
107
+ }
108
+ }
109
+ }
110
+ }
111
+ }
112
+ if(counting_cap && !writing_utf_8) {
113
+ cap_count++;
114
+ }
115
+ }
116
+
117
+
118
/*
 * Appends a NUL-terminated string to the output with the four HTML
 * metacharacters & > < " replaced by their entities. All other bytes are
 * passed to write_char unchanged.
 */
static void write_escaped_chars(char *chars) {
    char *cursor;

    for (cursor = chars; *cursor != 0; cursor++) {
        char current = *cursor;

        if (current == '&') {
            write_chars("&amp;");
        }
        else if (current == '>') {
            write_chars("&gt;");
        }
        else if (current == '<') {
            write_chars("&lt;");
        }
        else if (current == '"') {
            write_chars("&quot;");
        }
        else {
            write_char(current);
        }
    }
}
141
+
142
+
143
+
144
+
145
+ static void write_urlitized_chars(char *chars) {
146
+ int i = 0;
147
+ int wrote_underscore = FALSE;
148
+ char c;
149
+ for (i = 0; c = chars[i]; i++) {
150
+ if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
151
+ (c >= '0' && c <= '9')) {
152
+ write_char(c);
153
+ wrote_underscore = FALSE;
154
+ }
155
+ else {
156
+ if (!wrote_underscore) {
157
+ wrote_underscore = TRUE;
158
+ write_char('_');
159
+ }
160
+ }
161
+ }
162
+ }
163
+
164
/*
 * Decides whether a URL taken from an href/src attribute may be emitted.
 *
 * Returns 0 (FALSE) when string is NULL, when it is 4 bytes or shorter, or
 * when it starts with "java" in any letter case (this blocks javascript:
 * URLs). Returns 1 (TRUE) otherwise.
 *
 * Before looking for "java", leading bytes <= 0x20 are skipped and tab, CR
 * and LF are ignored while matching. Browsers strip these bytes when they
 * parse a URL, so " javascript:..." or "java\nscript:..." would otherwise
 * slip past a check that only inspects the first four bytes.
 */
static int valid_url(char * string){
    static const char blocked_prefix[] = "java";
    const unsigned char *cursor;
    size_t matched = 0;

    if (string == NULL || strlen(string) <= 4) {
        return 0;
    }

    cursor = (const unsigned char *)string;
    while (*cursor != '\0' && *cursor <= ' ') {
        cursor++;
    }

    for (; *cursor != '\0' && matched < 4; cursor++) {
        unsigned char current = *cursor;

        if (current == '\t' || current == '\n' || current == '\r') {
            continue;
        }
        if (current >= 'A' && current <= 'Z') {
            current = (unsigned char)(current + ('a' - 'A'));
        }
        if (current != (unsigned char)blocked_prefix[matched]) {
            return 1;
        }
        matched++;
    }

    /* the URL is rejected only when all four letters of "java" were matched */
    return matched < 4;
}
186
+
187
+
188
/*
 * Looks up key among the first num_keys entries of keys.
 *
 * Returns the index of the first entry that compares equal to key, or -1
 * when no entry matches.
 */
static int attribute_find(char *key, char **keys, int num_keys) {
    int index = 0;

    while (index < num_keys) {
        if (!strcmp(keys[index], key)) {
            return index;
        }
        index++;
    }
    return -1;
}
200
+
201
+ static int html_parse(int start, int finish) {
202
+ int close_tag = FALSE;
203
+ int tag_closed = FALSE;
204
+ char *tag;
205
+ char *name;
206
+ char c;
207
+ int i;
208
+ char *attribute_keys[MAX_ATTRIBUTES];
209
+ char *attribute_values[MAX_ATTRIBUTES];
210
+ int num_attributes = 0;
211
+ int tag_size;
212
+ int name_size = 0;
213
+
214
+ tag = input + start;
215
+ tag_size = finish - start;
216
+
217
+ // printf("tag start: %c tag_size:%d\n", tag[0], tag_size);
218
+ if(tag_size <= 0) {
219
+ return FALSE;
220
+ }
221
+
222
+ // read all the whitespace and first slash "< / "
223
+ for(; tag_size > 0; tag++, tag_size--) {
224
+ int done = FALSE;
225
+ switch(c = tag[0]) {
226
+ case '/':
227
+ if (close_tag){
228
+ return FALSE;
229
+ }
230
+ else {
231
+ close_tag = TRUE;
232
+ }
233
+ break;
234
+ case ' ':
235
+ case '\t':
236
+ case '\n':
237
+ // ignore
238
+ break;
239
+ default:
240
+ done = TRUE;
241
+ break;
242
+ }
243
+ if(done){
244
+ break;
245
+ }
246
+ }
247
+
248
+ // read all the whitespace and last slash
249
+ for(; tag_size > 0; tag_size--) {
250
+ int done = FALSE;
251
+ switch(c = tag[tag_size - 1]) {
252
+ case '/':
253
+ if (tag_closed){
254
+ return FALSE;
255
+ }
256
+ else {
257
+ tag_closed = TRUE;
258
+ }
259
+ break;
260
+ case ' ':
261
+ case '\t':
262
+ case '\n':
263
+ // ignore
264
+ break;
265
+ default:
266
+ done = TRUE;
267
+ break;
268
+ }
269
+ if(done){
270
+ break;
271
+ }
272
+ }
273
+
274
+ if(tag_size == 0) {
275
+ return FALSE;
276
+ }
277
+
278
+ //read name
279
+ for(name_size = 0; name_size < tag_size; name_size++) {
280
+ int done = FALSE;
281
+ switch(tag[name_size]) {
282
+ case ' ':
283
+ case '\t':
284
+ case '\n':
285
+ done = TRUE;
286
+ break;
287
+ }
288
+ if(done) {
289
+ break;
290
+ }
291
+ }
292
+ name = ALLOCA_N(char, name_size + 1);
293
+ strncpy(name, tag, name_size);
294
+ name[name_size] = 0;
295
+
296
+ // printf("name_size %d tag name: %s\n", name_size, name);
297
+
298
+ tag_size -= name_size;
299
+ tag += name_size;
300
+
301
+
302
+ // read attributes
303
+ for(; tag_size > 0; tag_size--, tag++) {
304
+ char *key = tag;
305
+ char *value = NULL;
306
+ int key_size = 0;
307
+ int value_size = 0; // would you like fries with that?
308
+ int skip_value = FALSE;
309
+
310
+ // read key
311
+ for(; key_size < tag_size; key_size++) {
312
+ int done = FALSE;
313
+ c = tag[key_size];
314
+ switch(c) {
315
+ case ' ':
316
+ case '\t':
317
+ case '\n':
318
+ if (key_size > 0) {
319
+ done = TRUE;
320
+ }
321
+ else {
322
+ //ignore preceding whitespace
323
+ key++;
324
+ key_size--;
325
+ tag++;
326
+ tag_size--;
327
+ }
328
+ break;
329
+ case '=':
330
+ if (key_size > 0) {
331
+ done = TRUE;
332
+ // do not advance, equals still needs to be found
333
+ }
334
+ else {
335
+ return FALSE;
336
+ }
337
+ break;
338
+ case '\'':
339
+ case '"':
340
+ return FALSE;
341
+ }
342
+ if(done) {
343
+ break;
344
+ }
345
+ } // got key
346
+ key = ALLOCA_N(char, key_size + 1);
347
+ strncpy(key, tag, key_size);
348
+ key[key_size] = 0;
349
+ tag += key_size;
350
+ tag_size -= key_size;
351
+
352
+ for(;tag_size > 0; tag++, tag_size--) {
353
+ int done = FALSE;
354
+ switch(tag[0]) {
355
+ case ' ':
356
+ case '\t':
357
+ case '\n':
358
+ //ignore whitespace
359
+ break;
360
+ case '=':
361
+ done = TRUE;
362
+ tag++;
363
+ tag_size--;
364
+ break;
365
+ default:
366
+ skip_value = TRUE;
367
+ done = TRUE;
368
+ break;
369
+ }
370
+ if(done) {
371
+ break;
372
+ }
373
+ }
374
+
375
+ if(!skip_value) {
376
+ // read value
377
+ int started_reading_value = FALSE;
378
+ for(; value_size < tag_size; value_size++) {
379
+ int done = FALSE;
380
+ c = tag[value_size];
381
+ switch(c) {
382
+ case ' ':
383
+ case '\t':
384
+ case '\n':
385
+ if (started_reading_value) {
386
+ done = TRUE;
387
+ }
388
+ else {
389
+ //ignore preceding whitespace
390
+ value_size--;
391
+ tag++;
392
+ tag_size--;
393
+ }
394
+ break;
395
+ case '=':
396
+ return FALSE;
397
+ case '\'':
398
+ case '"':
399
+ if(!value) {
400
+ started_reading_value = TRUE;
401
+ tag++;
402
+ tag_size --;
403
+ while(value_size <= tag_size && tag[value_size] != c) {
404
+ value_size++;
405
+ }
406
+ if(tag[value_size] != c){
407
+ return FALSE;
408
+ }
409
+ done = TRUE;
410
+ }
411
+ else {
412
+ return FALSE;
413
+ }
414
+ break;
415
+ default:
416
+ if (!started_reading_value) {
417
+ started_reading_value = TRUE;
418
+ }
419
+ break;
420
+ }
421
+ if(done) {
422
+ break;
423
+ }
424
+ }
425
+
426
+ if(started_reading_value) {
427
+ value = ALLOCA_N(char, value_size + 1);
428
+ strncpy(value, tag, value_size);
429
+ value[value_size] = 0;
430
+ tag += value_size;
431
+ tag_size -= value_size;
432
+ }
433
+ }
434
+
435
+ if (key_size > 0) {
436
+ if (num_attributes >= MAX_ATTRIBUTES) {
437
+ break;
438
+ }
439
+ attribute_keys[num_attributes] = key;
440
+ if (value) {
441
+ attribute_values[num_attributes] = value;
442
+ }
443
+ else {
444
+ attribute_values[num_attributes] = EMPTY_STRING;
445
+ }
446
+ num_attributes ++;
447
+ }
448
+ }
449
+
450
+ for(i = 0; i< num_attributes; i++) {
451
+ downcase(attribute_keys[i]);
452
+ }
453
+
454
+ //clean tag!
455
+ downcasen(name, name_size);
456
+ if(strcmp(name, "b") == 0){
457
+ name = STRONG;
458
+ }
459
+ else if(strcmp(name, "i") == 0) {
460
+ name = EM;
461
+ }
462
+
463
+ if(close_tag) {
464
+ if(tag_stack_size == 0){
465
+ return FALSE;
466
+ }
467
+ if(strcmp(tag_stack[tag_stack_size - 1], name) == 0){
468
+ if(strcmp(name, "a")) {
469
+ anchors_in_stack--;
470
+ }
471
+ write_chars("</");
472
+ write_chars(tag_stack[--tag_stack_size]);
473
+ write_char('>');
474
+ }
475
+ }
476
+ else { //not a close tag
477
+ if(tag_stack_size >= MAX_STACK_DEPTH) {
478
+ return FALSE;
479
+ }
480
+ if(strcmp(EM, name) == 0 || strcmp(STRONG, name) == 0 ||
481
+ strcmp("s", name) == 0 || strcmp("u", name) == 0 ||
482
+ strcmp("p", name) == 0 || strcmp("blockquote", name) == 0 ||
483
+ strcmp("pre", name) == 0){
484
+ if(tag_closed) {
485
+ return FALSE;
486
+ }
487
+ if(num_attributes != 0) {
488
+ return FALSE;
489
+ }
490
+ push_to_tag_stack(name, name_size);
491
+ write_char('<');
492
+ write_chars(name);
493
+ write_char('>');
494
+ }
495
+ else if(strcmp("br", name) == 0) {
496
+ if(num_attributes != 0) {
497
+ return FALSE;
498
+ }
499
+ write_chars("<br/>");
500
+ }
501
+ else if(strcmp("a", name) == 0) {
502
+ int href_pos;
503
+
504
+ // printf("trying A tag\n");
505
+ href_pos = attribute_find("href", attribute_keys,
506
+ num_attributes);
507
+ if(href_pos < 0) {
508
+ return FALSE;
509
+ }
510
+ if(!valid_url(attribute_values[href_pos])) {
511
+ return FALSE;
512
+ }
513
+
514
+ write_chars("<a rel=\"nofollow\" target=\"_blank\" href=\"");
515
+ write_chars(attribute_values[href_pos]);
516
+ write_chars("\">");
517
+
518
+ push_to_tag_stack("a", 1);
519
+ anchors_in_stack++;
520
+ }
521
+ else if(strcmp("img", name) == 0) {
522
+ int src_pos, alt_pos, width_pos, height_pos;
523
+
524
+ src_pos = attribute_find("src", attribute_keys,
525
+ num_attributes);
526
+ alt_pos = attribute_find("alt", attribute_keys,
527
+ num_attributes);
528
+ width_pos = attribute_find("width", attribute_keys,
529
+ num_attributes);
530
+ height_pos = attribute_find("height", attribute_keys,
531
+ num_attributes);
532
+ if(src_pos < 0) {
533
+ return FALSE;
534
+ }
535
+
536
+ if(!valid_url(attribute_values[src_pos])) {
537
+ return FALSE;
538
+ }
539
+
540
+ write_chars("<img src=\"");
541
+ write_chars(attribute_values[src_pos]);
542
+ if (alt_pos >= 0){
543
+ write_chars("\" alt=\"");
544
+ write_chars(attribute_values[alt_pos]);
545
+ }
546
+ if (width_pos >= 0){
547
+ write_chars("\" width=\"");
548
+ write_chars(attribute_values[width_pos]);
549
+ }
550
+ if (height_pos >= 0){
551
+ write_chars("\" height=\"");
552
+ write_chars(attribute_values[height_pos]);
553
+ }
554
+ write_chars("\" class=\"escapedImg\"/>");
555
+ }
556
+ else {
557
+ return FALSE;
558
+ }
559
+ }
560
+ return TRUE;
561
+ }
562
+
563
+ static int gr_tag_parse() {
564
+ char *tag;
565
+ char *attributes[MAX_ATTRIBUTES];
566
+ int num_attributes = 0;
567
+ char *name;
568
+ int tag_size;
569
+ int name_size;
570
+ int i;
571
+ char *id; // attribute[1]
572
+ char *title;// attribute[2], defaulted to attr[0]
573
+ tag = input + position;
574
+ tag_size = (ahead_position - position) - 1;
575
+
576
+ if(tag_size < 3) {
577
+ return FALSE;
578
+ }
579
+
580
+ //read name
581
+ for(name_size = 0; name_size < tag_size; name_size++) {
582
+ if(tag[name_size] == ':') {
583
+ break;
584
+ }
585
+ }
586
+ name = ALLOCA_N(char, name_size + 1);
587
+ strncpy(name, tag, name_size);
588
+ name[name_size] = 0;
589
+ downcasen(name, name_size);
590
+
591
+ if(position + name_size + 1 > input_size) {
592
+ printf("Error: position + name_size + 1 > input_size\n");
593
+ return;
594
+ }
595
+
596
+ tag += name_size + 1;
597
+ tag_size -= name_size + 1;
598
+ while(tag_size > 0) {
599
+ if (num_attributes == MAX_ATTRIBUTES) {
600
+ return FALSE;
601
+ }
602
+ int attr_size;
603
+ char *attr;
604
+ for(attr_size = 0; attr_size < tag_size; attr_size++) {
605
+ if(tag[attr_size] == '|') {
606
+ break;
607
+ }
608
+ }
609
+ attr = ALLOCA_N(char, attr_size + 1);
610
+ strncpy(attr, tag, attr_size);
611
+ attr[attr_size] = 0;
612
+ tag += attr_size + 1;
613
+ tag_size -= attr_size + 1;
614
+ attributes[num_attributes++] = attr;
615
+ }
616
+
617
+ if (num_attributes < 1) {
618
+ return FALSE;
619
+ }
620
+
621
+ if (num_attributes >= 2) {
622
+ id = attributes[1];
623
+ }
624
+ else {
625
+ id = 0;
626
+ }
627
+
628
+ if (num_attributes >= 3) {
629
+ title = attributes[2];
630
+ }
631
+ else {
632
+ title = attributes[0];
633
+ }
634
+
635
+ if(strcmp("b", name) == 0 || strcmp("book", name) == 0){
636
+ if (id) {
637
+ counting_cap = FALSE;
638
+ write_chars("<a href=\"");
639
+ if (absolute_url) {
640
+ write_nchars(absolute_url, absolute_url_size);
641
+ }
642
+ write_chars("/book/show/");
643
+ write_chars(id);
644
+ write_char('.');
645
+ write_urlitized_chars(title);
646
+ write_chars("\" title=\"");
647
+ write_escaped_chars(title);
648
+ if (num_attributes >= 4) {
649
+ write_chars(" by ");
650
+ write_escaped_chars(attributes[3]);
651
+ }
652
+ write_chars("\">");
653
+ counting_cap = TRUE;
654
+ write_escaped_chars(attributes[0]);
655
+ counting_cap = FALSE;
656
+ write_chars("</a>");
657
+ counting_cap = TRUE;
658
+ }
659
+ else {
660
+ counting_cap = FALSE;
661
+ write_chars("<a href=\"");
662
+ if (absolute_url) {
663
+ write_nchars(absolute_url, absolute_url_size);
664
+ }
665
+ write_chars("/search/search?q=");
666
+ write_escaped_chars(attributes[0]);
667
+ write_chars("\" title=\"");
668
+ write_chars(title);
669
+ write_chars("\">");
670
+ counting_cap = TRUE;
671
+ write_escaped_chars(attributes[0]);
672
+ counting_cap = FALSE;
673
+ write_chars("</a>");
674
+ counting_cap = TRUE;
675
+ }
676
+ }
677
+ else if(strcmp("bc", name) == 0){
678
+ if (num_attributes >= 5) {
679
+ counting_cap = FALSE;
680
+ write_chars("<a href=\"");
681
+ if (absolute_url) {
682
+ write_nchars(absolute_url, absolute_url_size);
683
+ }
684
+ write_chars("/book/show/");
685
+ write_chars(id);
686
+ write_char('.');
687
+ write_urlitized_chars(title);
688
+ write_chars("\"><img src=\"");
689
+ write_chars(attributes[4]);
690
+ write_chars("\" title=\"");
691
+ write_escaped_chars(title);
692
+ if (num_attributes >= 4) {
693
+ write_chars(" by ");
694
+ write_escaped_chars(attributes[3]);
695
+ }
696
+ write_chars("\" alt=\"");
697
+ write_escaped_chars(title);
698
+ write_chars("\"/></a>");
699
+ counting_cap = TRUE;
700
+ }
701
+ else {
702
+ write_chars("[bookcover:");
703
+ write_escaped_chars(attributes[0]);
704
+ write_char(']');
705
+ }
706
+ }
707
+ else if(strcmp("a", name) == 0 || strcmp("author", name) == 0){
708
+ if (id) {
709
+ counting_cap = FALSE;
710
+ write_chars("<a href=\"");
711
+ if (absolute_url) {
712
+ write_nchars(absolute_url, absolute_url_size);
713
+ }
714
+ write_chars("/author/show/");
715
+ write_chars(id);
716
+ write_char('.');
717
+ write_urlitized_chars(title);
718
+ write_chars("\" title=\"");
719
+ write_escaped_chars(title);
720
+ write_chars("\">");
721
+ counting_cap = TRUE;
722
+ write_escaped_chars(attributes[0]);
723
+ counting_cap = FALSE;
724
+ write_chars("</a>");
725
+ counting_cap = TRUE;
726
+ }
727
+ else {
728
+ counting_cap = FALSE;
729
+ write_chars("<a href=\"");
730
+ if (absolute_url) {
731
+ write_nchars(absolute_url, absolute_url_size);
732
+ }
733
+ write_chars("/search/search?q=");
734
+ write_escaped_chars(attributes[0]);
735
+ write_chars("\" title=\"");
736
+ write_chars(title);
737
+ write_chars("\">");
738
+ counting_cap = TRUE;
739
+ write_escaped_chars(attributes[0]);
740
+ counting_cap = FALSE;
741
+ write_chars("</a>");
742
+ counting_cap = TRUE;
743
+ }
744
+ }
745
+ else if(strcmp("ai", name) == 0){
746
+ if (num_attributes >= 4) {
747
+ counting_cap = FALSE;
748
+ write_chars("<a href=\"");
749
+ if (absolute_url) {
750
+ write_nchars(absolute_url, absolute_url_size);
751
+ }
752
+ write_chars("/author/show/");
753
+ write_chars(id);
754
+ write_char('.');
755
+ write_urlitized_chars(title);
756
+ write_chars("\"><img src=\"");
757
+ write_chars(attributes[3]);
758
+ write_chars("\" title=\"");
759
+ write_escaped_chars(title);
760
+ write_chars("\" alt=\"");
761
+ write_escaped_chars(title);
762
+ write_chars("\"/></a>");
763
+ counting_cap = TRUE;
764
+ }
765
+ else {
766
+ write_chars("[authorimage:");
767
+ write_escaped_chars(attributes[0]);
768
+ write_char(']');
769
+ }
770
+ }
771
+ else {
772
+ return FALSE;
773
+ }
774
+ return TRUE;
775
+ }
776
+
777
+ static int html_read() {
778
+ ahead_position = position;
779
+
780
+ while(ahead_position < input_size) {
781
+ switch(input[ahead_position++]) {
782
+ case '<':
783
+ return FALSE;
784
+ case '>':
785
+ return html_parse(position, ahead_position - 1);
786
+ }
787
+ }
788
+ return FALSE;
789
+ }
790
+
791
+ static int gr_tag_read() {
792
+ ahead_position = position;
793
+
794
+ // reading name
795
+ while(ahead_position < input_size) {
796
+ switch(input[ahead_position++]) {
797
+ case '[':
798
+ return FALSE;
799
+ case ']':
800
+ return gr_tag_parse();
801
+ }
802
+ }
803
+
804
+ return FALSE;
805
+ }
806
+
807
+ static int url_read() {
808
+ char *url;
809
+ char *url_downcase;
810
+ char c;
811
+ int i;
812
+ int url_size;
813
+ int has_http = FALSE;
814
+
815
+
816
+ if(anchors_in_stack > 0) {
817
+ return FALSE;
818
+ }
819
+
820
+ ahead_position = position;
821
+ url_size = 0;
822
+
823
+ while(ahead_position < input_size) {
824
+ int done = FALSE;
825
+ switch(input[ahead_position++]) {
826
+ case '"':
827
+ case '<':
828
+ case '>':
829
+ return FALSE;
830
+ case ' ':
831
+ case '\n':
832
+ case '\t':
833
+ case '(':
834
+ case ')':
835
+ ahead_position--;
836
+ done = TRUE;
837
+ break;
838
+ }
839
+ if(done) {
840
+ break;
841
+ }
842
+ }
843
+ url_size = 1 + ahead_position - position;
844
+
845
+ if (url_size < 5) {
846
+ return FALSE;
847
+ }
848
+
849
+ url = ALLOCA_N(char, url_size + 1);
850
+ strncpy(url, input + position - 1, url_size);
851
+ url[url_size] = 0;
852
+
853
+
854
+ url_downcase = ALLOCA_N(char, url_size + 1);
855
+ strncpy(url_downcase, url, url_size);
856
+ downcasen(url_downcase, url_size);
857
+
858
+
859
+ if (strncmp(url_downcase, "http://", 7) == 0) {
860
+ has_http = TRUE;
861
+ }
862
+ else if (strncmp(url_downcase, "https://", 8) == 0) {
863
+ has_http = TRUE;
864
+ }
865
+ else {
866
+ //try and decide if the its a url without 'http' in front
867
+ int has_www = FALSE;
868
+ int last_dot = -1;
869
+ int done = TRUE;
870
+ int tld_size;
871
+
872
+ //does it start with www.?
873
+ if (strncmp(url_downcase, "www.", 4) == 0) {
874
+ has_www = TRUE;
875
+ i = 4;
876
+ last_dot = 3;
877
+ }
878
+ else {
879
+ i = 0;
880
+ }
881
+
882
+ // see if it starts with a properly formed domain name
883
+ for(; i < url_size; i++) {
884
+ c = url_downcase[i];
885
+ if (c == '.') {
886
+ //starting with a period is invalid
887
+ if(i == 0) {
888
+ return FALSE;
889
+ }
890
+
891
+ //two periods in a row is invalid!
892
+ if(last_dot + 1 == i){
893
+ return FALSE;
894
+ }
895
+ last_dot = i;
896
+ }
897
+ else if (c == '/') {
898
+ // a slash means we're no longer reading a domain name
899
+ break;
900
+ }
901
+ else if ((c >= 'a' && c <= 'z') || c == '-' ||
902
+ (c >= '0' && c <= '9')) {
903
+ // valid domain name characters
904
+ }
905
+ else {
906
+ // domains must be made up of those other characters
907
+ return FALSE;
908
+ }
909
+ }
910
+
911
+ if(last_dot == -1) { // no periods were found
912
+ return FALSE;
913
+ }
914
+ tld_size = (i - last_dot) - 1;
915
+ if (has_www) {
916
+ if (tld_size < 2){
917
+ return FALSE;
918
+ }
919
+ }
920
+ else {
921
+ char *tld;
922
+ if (tld_size != 3){
923
+ return FALSE;
924
+ }
925
+ tld = url + last_dot + 1;
926
+ if(strncmp(tld, "com", 3) != 0 &&
927
+ strncmp(tld, "net", 3) != 0 &&
928
+ strncmp(tld, "org", 3) != 0 &&
929
+ strncmp(tld, "gov", 3) != 0){
930
+ // not a tld we autogenerate for!
931
+ return FALSE;
932
+ }
933
+ }
934
+
935
+ }
936
+
937
+ //OK, now its probably ok to generate the url
938
+ counting_cap = FALSE;
939
+ write_chars("<a rel=\"nofollow\" target=\"_blank\" href=\"");
940
+ if (!has_http) {
941
+ write_chars("http://");
942
+ }
943
+ write_chars(url);
944
+ if (url_size > MAX_URL_PRINT_SIZE) {
945
+ write_chars("\" title=\"");
946
+ write_chars(url);
947
+ }
948
+ write_chars("\">");
949
+ counting_cap = TRUE;
950
+ for(i = 0; i < MAX_URL_PRINT_SIZE && i < url_size; i++) {
951
+ switch(c = url[i]) {
952
+ case '&':
953
+ write_chars("&amp;");
954
+ break;
955
+ default:
956
+ write_char(c);
957
+ break;
958
+ }
959
+ }
960
+ if(i < url_size) {
961
+ write_chars("...");
962
+ }
963
+ counting_cap = FALSE;
964
+ write_chars("</a>");
965
+ counting_cap = TRUE;
966
+ return TRUE;
967
+ }
968
+
969
+
970
+ static int amp_read() {
971
+ int amp_escape_count = 0;
972
+ int poundsign = FALSE;
973
+ ahead_position = position;
974
+
975
+ // reading name
976
+ while(ahead_position < input_size) {
977
+ char c;
978
+ c = input[ahead_position++];
979
+ if (c == '#') {
980
+ if(amp_escape_count == 0) {
981
+ poundsign = TRUE;
982
+ }
983
+ else {
984
+ return FALSE;
985
+ }
986
+ }
987
+ else if(c >= '0' && c <= '9') {
988
+ }
989
+ else if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
990
+ if (poundsign) {
991
+ return FALSE;
992
+ }
993
+ }
994
+ else if (c == ';') {
995
+ if (amp_escape_count >= 2){
996
+ write_nchars(input + position - 1, amp_escape_count + 2);
997
+ return TRUE;
998
+ }
999
+ else {
1000
+ return FALSE;
1001
+ }
1002
+ }
1003
+ else {
1004
+ return FALSE;
1005
+ }
1006
+
1007
+ if(amp_escape_count++ > 6) {
1008
+ return FALSE;
1009
+ }
1010
+ }
1011
+
1012
+ return FALSE;
1013
+ }
1014
+
1015
+
1016
+ static VALUE t_parse(VALUE self, VALUE r_string, VALUE r_cap, VALUE r_cap_string) {
1017
+ char c;
1018
+ int try_url = TRUE;
1019
+ int cap_at;
1020
+ char * cap_string;
1021
+ VALUE result;
1022
+ tag_stack_size = 0;
1023
+ anchors_in_stack = 0;
1024
+ counting_cap = TRUE;
1025
+ cap_count = 0;
1026
+ writing_utf_8 = 0;
1027
+ position = ahead_position = 0;
1028
+
1029
+ //de-ruby all the inputs!
1030
+ r_string = StringValue(r_string);
1031
+ input_size = RSTRING(r_string)->len;
1032
+ input = RSTRING(r_string)->ptr;
1033
+
1034
+ if (NIL_P(r_cap)) {
1035
+ cap_at = 0;
1036
+ }
1037
+ else {
1038
+ cap_at = NUM2INT(r_cap);
1039
+ }
1040
+
1041
+ cap_string = StringValuePtr(r_cap_string);
1042
+
1043
+ max_output_size = input_size * 10 + 128;
1044
+ output = ALLOCA_N(char, max_output_size);
1045
+ output_size = 0;
1046
+
1047
+ tag_stack = ALLOCA_N(char *, MAX_STACK_DEPTH);
1048
+
1049
+ while(position < input_size) {
1050
+
1051
+ if(cap_at && cap_count >= cap_at) {
1052
+ write_chars(cap_string);
1053
+ break;
1054
+ }
1055
+ switch(c = input[position++]) {
1056
+ case '&':
1057
+ counting_cap = FALSE;
1058
+ if(amp_read()) {
1059
+ position = ahead_position;
1060
+ }
1061
+ else {
1062
+ write_chars("&amp;");
1063
+ }
1064
+ counting_cap = TRUE;
1065
+ try_url = FALSE;
1066
+ cap_count++;
1067
+ break;
1068
+ case '>':
1069
+ try_url = FALSE;
1070
+ write_chars("&gt;");
1071
+ break;
1072
+ case '<':
1073
+ counting_cap = FALSE;
1074
+ if(html_read()) {
1075
+ position = ahead_position;
1076
+ try_url = TRUE;
1077
+ counting_cap = TRUE;
1078
+ }
1079
+ else {
1080
+ counting_cap = TRUE;
1081
+ write_chars("&lt;");
1082
+ try_url = FALSE;
1083
+ }
1084
+ break;
1085
+ case '[':
1086
+ counting_cap = FALSE;
1087
+ if(gr_tag_read()) {
1088
+ position = ahead_position;
1089
+ try_url = TRUE;
1090
+ counting_cap = TRUE;
1091
+ }
1092
+ else {
1093
+ try_url = FALSE;
1094
+ counting_cap = TRUE;
1095
+ write_char('[');
1096
+ }
1097
+ break;
1098
+ case '"':
1099
+ try_url = FALSE;
1100
+ write_chars("&quot;");
1101
+ break;
1102
+ case '\n':
1103
+ write_chars("<br/>");
1104
+ try_url = TRUE;
1105
+ break;
1106
+ case ' ':
1107
+ case '\t':
1108
+ case '(':
1109
+ case ')':
1110
+ write_char(c);
1111
+ try_url = TRUE;
1112
+ break;
1113
+ default:
1114
+ if(try_url){
1115
+ if(url_read()) {
1116
+ position = ahead_position;
1117
+ }
1118
+ else {
1119
+ write_char(c);
1120
+ }
1121
+ try_url = FALSE;
1122
+ }
1123
+ else {
1124
+ write_char(c);
1125
+ }
1126
+ break;
1127
+ }
1128
+ }
1129
+ while(tag_stack_size > 0) {
1130
+ char *item = tag_stack[--tag_stack_size];
1131
+ write_chars("</");
1132
+ write_chars(item);
1133
+ free(item);
1134
+ write_char('>');
1135
+ }
1136
+ return rb_str_new(output, output_size);
1137
+ }
1138
+
1139
+
1140
+ static VALUE t_set_absolute_url(VALUE self, VALUE r_string) {
1141
+ int new_size;
1142
+ char *new_url;
1143
+ r_string = StringValue(r_string);
1144
+ absolute_url_size = RSTRING(r_string)->len;
1145
+ new_url = RSTRING(r_string)->ptr;
1146
+ if (absolute_url) {
1147
+ free(absolute_url);
1148
+ }
1149
+ absolute_url = malloc(absolute_url_size);
1150
+ strncpy(absolute_url, new_url, absolute_url_size);
1151
+ return r_string;
1152
+ }
1153
+
1154
+
1155
+ VALUE cTest;
1156
+
1157
+ void Init_gr_string_escape() {
1158
+ cTest = rb_define_class("GrStringEscape", rb_cObject);
1159
+ rb_define_method(cTest, "parse", t_parse, 3);
1160
+ rb_define_method(cTest, "set_absolute_url", t_set_absolute_url, 1);
1161
+ id_push = rb_intern("push");
1162
+ }