wikitext 1.0.3 → 1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/parser.c +89 -31
- data/ext/parser.h +2 -0
- data/ext/token.c +1 -0
- data/ext/token.h +1 -0
- data/ext/wikitext.c +2 -0
- data/ext/wikitext_ragel.c +441 -525
- data/lib/wikitext/version.rb +1 -1
- data/spec/fulltext_spec.rb +91 -0
- data/spec/indentation_spec.rb +3 -3
- data/spec/tokenizing_spec.rb +3 -3
- data/spec/wikitext_spec.rb +6 -0
- metadata +3 -2
data/spec/fulltext_spec.rb
CHANGED
@@ -0,0 +1,91 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Copyright 2008 Wincent Colaiuta
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require File.join(File.dirname(__FILE__), 'spec_helper.rb')
|
17
|
+
require 'wikitext'
|
18
|
+
|
19
|
+
describe Wikitext::Parser, 'fulltext tokenizing' do
|
20
|
+
before do
|
21
|
+
@parser = Wikitext::Parser.new
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'should default to a minimum fulltext token length of 3' do
|
25
|
+
@parser.minimum_fulltext_token_length.should == 3
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should accept overrides of minimum fulltext token length at initialization time' do
|
29
|
+
parser = Wikitext::Parser.new(:minimum_fulltext_token_length => 10)
|
30
|
+
parser.minimum_fulltext_token_length.should == 10
|
31
|
+
parser.fulltext_tokenize('short loooooooooong').should == ['loooooooooong']
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should return nil for nil input' do
|
35
|
+
@parser.fulltext_tokenize(nil).should be_nil
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'should return an empty array for empty string input' do
|
39
|
+
@parser.fulltext_tokenize('').should == []
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should return an empty array for an input string that contains nothing tokenizable' do
|
43
|
+
@parser.fulltext_tokenize('#!?()/&').should == []
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should tokenize simple words' do
|
47
|
+
@parser.fulltext_tokenize('foo bar baz').should == ['foo', 'bar', 'baz']
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'should omit tokens shorter than the minimum required length' do
|
51
|
+
@parser.fulltext_tokenize('a b baz longer').should == ['baz', 'longer']
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'should accept overrides of minimum length at parse time' do
|
55
|
+
@parser.fulltext_tokenize('a bc baz longer', :minimum => 2).should == ['bc', 'baz', 'longer']
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'should treat a minimum length of 0 as meaning "no minimum length"' do
|
59
|
+
@parser.fulltext_tokenize('a bc baz longer', :minimum => 0).should == ['a', 'bc', 'baz', 'longer']
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'should interpret a minimum length of nil as meaning "default minumum length" (3)' do
|
63
|
+
@parser.minimum_fulltext_token_length = 10
|
64
|
+
@parser.fulltext_tokenize('a bc baz longer', :minimum => nil).should == ['baz', 'longer']
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'should tokenize URLs' do
|
68
|
+
@parser.fulltext_tokenize('foo http://example.com/ bar').should == ['foo', 'http://example.com/', 'bar']
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'should tokenize email addresses' do
|
72
|
+
@parser.fulltext_tokenize('foo user@example.com bar').should == ['foo', 'user@example.com', 'bar']
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'should ignore punctuation' do
|
76
|
+
@parser.fulltext_tokenize("don't forget!").should == ['don', 'forget']
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'should ignore non-ASCII' do
|
80
|
+
# note that a search for "información lingüística" will still work, but might return some false positives
|
81
|
+
@parser.fulltext_tokenize('buscando información lingüística').should == ['buscando', 'informaci', 'ling', 'stica']
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'should ignore wikitext markup' do
|
85
|
+
@parser.fulltext_tokenize("this <nowiki>that</nowiki> [[foo bar]]").should == ['this', 'that', 'foo', 'bar']
|
86
|
+
end
|
87
|
+
|
88
|
+
it 'should tokenize alphanumerics' do
|
89
|
+
@parser.fulltext_tokenize("password99 2008").should == ['password99', '2008']
|
90
|
+
end
|
91
|
+
end
|
data/spec/indentation_spec.rb
CHANGED
@@ -47,12 +47,12 @@ describe Wikitext::Parser, 'indentation' do
|
|
47
47
|
end
|
48
48
|
|
49
49
|
it 'should complain if the "indent" option is nil' do
|
50
|
-
lambda { @parser.parse('* foo', :
|
50
|
+
lambda { @parser.parse('* foo', :indent => nil) }.should raise_error(TypeError)
|
51
51
|
end
|
52
52
|
|
53
53
|
it 'should complain if the "indent" options is not an integer' do
|
54
|
-
lambda { @parser.parse('* foo', :
|
55
|
-
lambda { @parser.parse('* foo', :
|
54
|
+
lambda { @parser.parse('* foo', :indent => 'bar') }.should raise_error(TypeError)
|
55
|
+
lambda { @parser.parse('* foo', :indent => /baz/) }.should raise_error(TypeError)
|
56
56
|
end
|
57
57
|
|
58
58
|
it 'should treat a negative "indent" as though it were zero' do
|
data/spec/tokenizing_spec.rb
CHANGED
@@ -56,7 +56,7 @@ describe Wikitext::Parser, 'tokenizing' do
|
|
56
56
|
it 'should tokenize strings containing a single symbol' do
|
57
57
|
@tokens = @parser.tokenize('foo')
|
58
58
|
@tokens.length.should == 2
|
59
|
-
@tokens[0].token_type.should == :
|
59
|
+
@tokens[0].token_type.should == :alnum
|
60
60
|
@tokens[0].string_value.should == 'foo'
|
61
61
|
@tokens[1].token_type.should == :end_of_file
|
62
62
|
@tokens[1].string_value.should == ''
|
@@ -65,7 +65,7 @@ describe Wikitext::Parser, 'tokenizing' do
|
|
65
65
|
it 'should tokenize strings containing multiple symbols' do
|
66
66
|
@tokens = @parser.tokenize('foo http://example.com/')
|
67
67
|
@tokens.length.should == 4
|
68
|
-
@tokens[0].token_type.should == :
|
68
|
+
@tokens[0].token_type.should == :alnum
|
69
69
|
@tokens[0].string_value.should == 'foo'
|
70
70
|
@tokens[1].token_type.should == :space
|
71
71
|
@tokens[1].string_value.should == ' '
|
@@ -78,7 +78,7 @@ describe Wikitext::Parser, 'tokenizing' do
|
|
78
78
|
it 'should tokenize runs of printable characters as as single symbol' do
|
79
79
|
@tokens = @parser.tokenize('foo')
|
80
80
|
@tokens.length.should == 2
|
81
|
-
@tokens[0].token_type.should == :
|
81
|
+
@tokens[0].token_type.should == :alnum
|
82
82
|
@tokens[0].string_value.should == 'foo'
|
83
83
|
@tokens[0].line_start.should == 1
|
84
84
|
@tokens[0].column_start.should == 1
|
data/spec/wikitext_spec.rb
CHANGED
@@ -88,6 +88,12 @@ describe Wikitext::Parser do
|
|
88
88
|
Wikitext::Parser.new(:treat_slash_as_special => false).treat_slash_as_special.should == false
|
89
89
|
end
|
90
90
|
end
|
91
|
+
|
92
|
+
describe 'overriding defaults at parse time' do
|
93
|
+
it 'should ignore unknown options' do
|
94
|
+
@parser.parse('foo', :bar => 'baz').should == "<p>foo</p>\n"
|
95
|
+
end
|
96
|
+
end
|
91
97
|
end
|
92
98
|
|
93
99
|
describe Wikitext::Parser, 'parsing non-ASCII input' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wikitext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.3
|
4
|
+
version: "1.1"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Wincent Colaiuta
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-04-
|
12
|
+
date: 2008-04-25 00:00:00 +02:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -28,6 +28,7 @@ files:
|
|
28
28
|
- spec/encoding_spec.rb
|
29
29
|
- spec/entity_spec.rb
|
30
30
|
- spec/external_link_spec.rb
|
31
|
+
- spec/fulltext_spec.rb
|
31
32
|
- spec/h1_spec.rb
|
32
33
|
- spec/h2_spec.rb
|
33
34
|
- spec/h3_spec.rb
|