wikitext 1.0.3 → 1.1
- data/ext/parser.c +89 -31
- data/ext/parser.h +2 -0
- data/ext/token.c +1 -0
- data/ext/token.h +1 -0
- data/ext/wikitext.c +2 -0
- data/ext/wikitext_ragel.c +441 -525
- data/lib/wikitext/version.rb +1 -1
- data/spec/fulltext_spec.rb +91 -0
- data/spec/indentation_spec.rb +3 -3
- data/spec/tokenizing_spec.rb +3 -3
- data/spec/wikitext_spec.rb +6 -0
- metadata +3 -2
data/spec/fulltext_spec.rb
CHANGED
@@ -0,0 +1,91 @@
+#!/usr/bin/env ruby
+# Copyright 2008 Wincent Colaiuta
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+require File.join(File.dirname(__FILE__), 'spec_helper.rb')
+require 'wikitext'
+
+describe Wikitext::Parser, 'fulltext tokenizing' do
+  before do
+    @parser = Wikitext::Parser.new
+  end
+
+  it 'should default to a minimum fulltext token length of 3' do
+    @parser.minimum_fulltext_token_length.should == 3
+  end
+
+  it 'should accept overrides of minimum fulltext token length at initialization time' do
+    parser = Wikitext::Parser.new(:minimum_fulltext_token_length => 10)
+    parser.minimum_fulltext_token_length.should == 10
+    parser.fulltext_tokenize('short loooooooooong').should == ['loooooooooong']
+  end
+
+  it 'should return nil for nil input' do
+    @parser.fulltext_tokenize(nil).should be_nil
+  end
+
+  it 'should return an empty array for empty string input' do
+    @parser.fulltext_tokenize('').should == []
+  end
+
+  it 'should return an empty array for an input string that contains nothing tokenizable' do
+    @parser.fulltext_tokenize('#!?()/&').should == []
+  end
+
+  it 'should tokenize simple words' do
+    @parser.fulltext_tokenize('foo bar baz').should == ['foo', 'bar', 'baz']
+  end
+
+  it 'should omit tokens shorter than the minimum required length' do
+    @parser.fulltext_tokenize('a b baz longer').should == ['baz', 'longer']
+  end
+
+  it 'should accept overrides of minimum length at parse time' do
+    @parser.fulltext_tokenize('a bc baz longer', :minimum => 2).should == ['bc', 'baz', 'longer']
+  end
+
+  it 'should treat a minimum length of 0 as meaning "no minimum length"' do
+    @parser.fulltext_tokenize('a bc baz longer', :minimum => 0).should == ['a', 'bc', 'baz', 'longer']
+  end
+
+  it 'should interpret a minimum length of nil as meaning "default minumum length" (3)' do
+    @parser.minimum_fulltext_token_length = 10
+    @parser.fulltext_tokenize('a bc baz longer', :minimum => nil).should == ['baz', 'longer']
+  end
+
+  it 'should tokenize URLs' do
+    @parser.fulltext_tokenize('foo http://example.com/ bar').should == ['foo', 'http://example.com/', 'bar']
+  end
+
+  it 'should tokenize email addresses' do
+    @parser.fulltext_tokenize('foo user@example.com bar').should == ['foo', 'user@example.com', 'bar']
+  end
+
+  it 'should ignore punctuation' do
+    @parser.fulltext_tokenize("don't forget!").should == ['don', 'forget']
+  end
+
+  it 'should ignore non-ASCII' do
+    # note that a search for "información lingüística" will still work, but might return some false positives
+    @parser.fulltext_tokenize('buscando información lingüística').should == ['buscando', 'informaci', 'ling', 'stica']
+  end
+
+  it 'should ignore wikitext markup' do
+    @parser.fulltext_tokenize("this <nowiki>that</nowiki> [[foo bar]]").should == ['this', 'that', 'foo', 'bar']
+  end
+
+  it 'should tokenize alphanumerics' do
+    @parser.fulltext_tokenize("password99 2008").should == ['password99', '2008']
+  end
+end
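The new spec file above documents the fulltext tokenizer introduced in 1.1. A rough usage sketch, inferred only from the expectations in that spec (method and option names are taken from the spec; nothing beyond those examples is guaranteed):

    require 'wikitext'

    parser = Wikitext::Parser.new(:minimum_fulltext_token_length => 4)
    parser.fulltext_tokenize('foo bar longer')            # => ['longer']

    # the minimum can also be overridden per call; 0 means "no minimum"
    parser.fulltext_tokenize('a bc baz', :minimum => 2)   # => ['bc', 'baz']
    parser.fulltext_tokenize(nil)                         # => nil
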
data/spec/indentation_spec.rb
CHANGED
@@ -47,12 +47,12 @@ describe Wikitext::Parser, 'indentation' do
   end
 
   it 'should complain if the "indent" option is nil' do
-    lambda { @parser.parse('* foo', :
+    lambda { @parser.parse('* foo', :indent => nil) }.should raise_error(TypeError)
   end
 
   it 'should complain if the "indent" options is not an integer' do
-    lambda { @parser.parse('* foo', :
-    lambda { @parser.parse('* foo', :
+    lambda { @parser.parse('* foo', :indent => 'bar') }.should raise_error(TypeError)
+    lambda { @parser.parse('* foo', :indent => /baz/) }.should raise_error(TypeError)
   end
 
   it 'should treat a negative "indent" as though it were zero' do
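These changes tighten the type checking on the :indent parse-time option. A minimal sketch of what the updated specs exercise (the idea that :indent is a non-negative integer controlling output indentation is inferred from the option name and the negative-value spec, not stated in this diff):

    parser = Wikitext::Parser.new
    parser.parse('* foo')                     # default indentation
    parser.parse('* foo', :indent => 2)       # integer values are accepted
    parser.parse('* foo', :indent => nil)     # raises TypeError
    parser.parse('* foo', :indent => 'bar')   # raises TypeError
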
data/spec/tokenizing_spec.rb
CHANGED
@@ -56,7 +56,7 @@ describe Wikitext::Parser, 'tokenizing' do
   it 'should tokenize strings containing a single symbol' do
     @tokens = @parser.tokenize('foo')
     @tokens.length.should == 2
-    @tokens[0].token_type.should == :
+    @tokens[0].token_type.should == :alnum
     @tokens[0].string_value.should == 'foo'
     @tokens[1].token_type.should == :end_of_file
     @tokens[1].string_value.should == ''
@@ -65,7 +65,7 @@ describe Wikitext::Parser, 'tokenizing' do
   it 'should tokenize strings containing multiple symbols' do
     @tokens = @parser.tokenize('foo http://example.com/')
     @tokens.length.should == 4
-    @tokens[0].token_type.should == :
+    @tokens[0].token_type.should == :alnum
     @tokens[0].string_value.should == 'foo'
     @tokens[1].token_type.should == :space
     @tokens[1].string_value.should == ' '
@@ -78,7 +78,7 @@ describe Wikitext::Parser, 'tokenizing' do
   it 'should tokenize runs of printable characters as as single symbol' do
     @tokens = @parser.tokenize('foo')
     @tokens.length.should == 2
-    @tokens[0].token_type.should == :
+    @tokens[0].token_type.should == :alnum
     @tokens[0].string_value.should == 'foo'
     @tokens[0].line_start.should == 1
     @tokens[0].column_start.should == 1
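The tokenizing specs now expect an :alnum token type for runs of letters and digits. A small sketch of the token API, based only on the expectations above:

    tokens = Wikitext::Parser.new.tokenize('foo http://example.com/')
    tokens.length             # => 4
    tokens[0].token_type      # => :alnum
    tokens[0].string_value    # => 'foo'
    tokens[0].line_start      # => 1
    tokens[0].column_start    # => 1
    tokens[1].token_type      # => :space
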
data/spec/wikitext_spec.rb
CHANGED
@@ -88,6 +88,12 @@ describe Wikitext::Parser do
       Wikitext::Parser.new(:treat_slash_as_special => false).treat_slash_as_special.should == false
     end
   end
+
+  describe 'overriding defaults at parse time' do
+    it 'should ignore unknown options' do
+      @parser.parse('foo', :bar => 'baz').should == "<p>foo</p>\n"
+    end
+  end
 end
 
 describe Wikitext::Parser, 'parsing non-ASCII input' do
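The added spec documents that unknown parse-time options are silently ignored rather than raising:

    parser = Wikitext::Parser.new
    parser.parse('foo', :bar => 'baz')   # => "<p>foo</p>\n" (the unknown :bar option is ignored)
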
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wikitext
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: "1.1"
 platform: ruby
 authors:
 - Wincent Colaiuta
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2008-04-
+date: 2008-04-25 00:00:00 +02:00
 default_executable:
 dependencies: []
 
@@ -28,6 +28,7 @@ files:
 - spec/encoding_spec.rb
 - spec/entity_spec.rb
 - spec/external_link_spec.rb
+- spec/fulltext_spec.rb
 - spec/h1_spec.rb
 - spec/h2_spec.rb
 - spec/h3_spec.rb