tweetparser 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/tweetparser.rb +12 -0
- data/lib/tweetparser/grammar.treetop +37 -9
- data/test/conformance_test.rb +48 -0
- data/test/parser_test.rb +30 -9
- data/test/twitter-text-conformance/README +6 -0
- data/test/twitter-text-conformance/autolink.yml +250 -0
- data/test/twitter-text-conformance/extract.yml +193 -0
- metadata +6 -2
data/lib/tweetparser.rb
CHANGED
@@ -1,3 +1,15 @@
|
|
1
1
|
require "treetop"
|
2
2
|
require "polyglot"
|
3
3
|
require "tweetparser/grammar"
|
4
|
+
|
5
|
+
# Public entry point of the library: turns raw tweet text into a flat list
# of typed tokens.
module TweetParser
  # Parses +input+ with a fresh TweetContentParser and returns the result of
  # calling #content on the parse tree.
  #
  # While the parser runs, the global $KCODE is forced to "n"; the previous
  # value is put back afterwards.
  # NOTE(review): presumably "n" is needed so the grammar's byte-oriented
  # character classes behave the same regardless of the caller's $KCODE
  # setting -- confirm against the grammar.
  #
  # input - the tweet text to parse.
  #
  # Returns the parse tree's #content, or nil when the parser yields no
  # parse tree for +input+.
  def self.parse(input)
    saved_kcode = $KCODE
    $KCODE = "n"
    begin
      parsed = TweetContentParser.new.parse(input)
    ensure
      # $KCODE is process-wide state: restore it even if the parser raises,
      # otherwise one failing call would silently change the caller's setting.
      $KCODE = saved_kcode
    end
    return nil unless parsed
    parsed.content
  end
end
|
@@ -1,38 +1,66 @@
|
|
1
1
|
grammar TweetContent
|
2
2
|
rule tweet
|
3
|
-
(url / html / space / newline /
|
3
|
+
(url / html / space / newline / list / username / hashtag / slash / text)* {
|
4
4
|
def content
|
5
5
|
elements.map{ |e| e.content }
|
6
6
|
end
|
7
7
|
}
|
8
8
|
end
|
9
9
|
|
10
|
+
rule subdomain
|
11
|
+
([a-zA-Z0-9\-] / [^\x20-\x7F])+
|
12
|
+
end
|
13
|
+
|
10
14
|
rule url
|
11
|
-
"http" "
|
15
|
+
(("http" / "HTTP") [sS]? "://" / "www." / "WWW.") subdomain ("." subdomain)+ ("/" [\.a-zA-Z0-9\?#=\-_&%]*)* {
|
12
16
|
def content
|
13
17
|
[:url, text_value]
|
14
18
|
end
|
15
19
|
}
|
16
20
|
end
|
17
21
|
|
18
|
-
rule
|
19
|
-
|
22
|
+
rule name
|
23
|
+
[a-zA-Z0-9_]+
|
24
|
+
end
|
25
|
+
|
26
|
+
rule name_with_letters
|
27
|
+
[a-zA-Z] [a-zA-Z0-9_]* / [0-9_]+ [a-zA-Z] [a-zA-Z0-9_]*
|
28
|
+
end
|
29
|
+
|
30
|
+
rule username
|
31
|
+
("@" / "＠") name {
|
32
|
+
def content
|
33
|
+
[:username, text_value]
|
34
|
+
end
|
35
|
+
}
|
36
|
+
end
|
37
|
+
|
38
|
+
rule list
|
39
|
+
username "/" name {
|
20
40
|
def content
|
21
|
-
[:
|
41
|
+
[:list, text_value]
|
22
42
|
end
|
23
43
|
}
|
24
44
|
end
|
25
45
|
|
26
46
|
rule hashtag
|
27
|
-
"#"
|
47
|
+
("#" / "＃") name_with_letters {
|
28
48
|
def content
|
29
49
|
[:hashtag, text_value]
|
30
50
|
end
|
31
51
|
}
|
32
52
|
end
|
33
53
|
|
54
|
+
rule slash
|
55
|
+
"/" name {
|
56
|
+
def content
|
57
|
+
[:slash, text_value]
|
58
|
+
end
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
34
62
|
rule text
|
35
|
-
|
63
|
+
[\S]+ {
|
36
64
|
def content
|
37
65
|
[:text, text_value]
|
38
66
|
end
|
@@ -50,13 +78,13 @@ grammar TweetContent
|
|
50
78
|
rule newline
|
51
79
|
"\r"? "\n" {
|
52
80
|
def content
|
53
|
-
[:newline]
|
81
|
+
[:newline, text_value]
|
54
82
|
end
|
55
83
|
}
|
56
84
|
end
|
57
85
|
|
58
86
|
rule space
|
59
|
-
" "+ {
|
87
|
+
(" " / "　")+ {
|
60
88
|
def content
|
61
89
|
[:space, text_value]
|
62
90
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
$:.unshift(File.expand_path("../../lib", __FILE__))
|
3
|
+
require "test/unit"
|
4
|
+
require "shoulda"
|
5
|
+
require "tweetparser"
|
6
|
+
require "yaml"
|
7
|
+
require "cgi"
|
8
|
+
|
9
|
+
# Runs every case in the twitter-text autolink conformance file through
# TweetParser and compares the re-rendered HTML with the expected markup.
class AutolinkConformanceTest < Test::Unit::TestCase
  DATA_PATH = File.expand_path("../twitter-text-conformance/autolink.yml", __FILE__)

  # Renders one parsed [type, value] token the way the conformance file
  # expects it to be autolinked. Token types without a link form are
  # returned as their raw value.
  def autolink_markup(type, value)
    case type
    when :username
      # scan splits the token into its first character and the remainder.
      sigil, handle = value.scan(/^.|.*/)
      %{#{sigil}<a class="tweet-url username" href="http://twitter.com/#{handle}">#{handle}</a>}
    when :list
      sigil, slug = value.scan(/^.|.*/)
      %{#{sigil}<a class="tweet-url list-slug" href=\"http://twitter.com/#{slug}">#{slug}</a>}
    when :hashtag
      # Only the part after the leading hash character is used in the link
      # target and title; the link text keeps the token as written.
      tag = value.scan(/^.|.*/)[1]
      %{<a href="http://twitter.com/search?q=%23#{tag}" } +
        %{title="\##{tag}" class="tweet-url hashtag">#{value}</a>}
    when :url
      # URLs that do not already start with "http" get a scheme prepended.
      target = (value =~ /^http/i) ? value : "http://" + value
      %{<a href="#{target}">#{value}</a>}
    else
      value
    end
  end

  # Parses +input+, rebuilds the autolinked string token by token and
  # asserts that it equals +expected+. The parsed token list is used as the
  # failure message.
  def assert_autolink(expected, input)
    tokens = TweetParser.parse(input)
    assert tokens, "Failed to parse #{input}"

    rendered = ""
    tokens.each do |type, value|
      rendered << autolink_markup(type, value)
    end

    assert_equal expected, rendered, tokens.inspect
  end

  # One shoulda context per section of the data file, one test per case.
  sections = YAML.load(File.read(DATA_PATH))["tests"]
  sections.each do |section, examples|
    context "when testing #{section}" do
      examples.each do |example|
        should example["description"] do
          assert_autolink example["expected"], example["text"]
        end
      end
    end
  end
end
|
data/test/parser_test.rb
CHANGED
@@ -6,12 +6,8 @@ require "tweetparser"
|
|
6
6
|
|
7
7
|
class ParserTest < Test::Unit::TestCase
|
8
8
|
|
9
|
-
def setup
|
10
|
-
@parser = TweetContentParser.new
|
11
|
-
end
|
12
|
-
|
13
9
|
def assert_parses(expected, input)
|
14
|
-
actual =
|
10
|
+
actual = TweetParser.parse(input)
|
15
11
|
assert_equal expected, actual
|
16
12
|
end
|
17
13
|
|
@@ -24,6 +20,21 @@ class ParserTest < Test::Unit::TestCase
|
|
24
20
|
assert_parses [[:url, s]], s
|
25
21
|
end
|
26
22
|
|
23
|
+
should "extract url with www and no http" do
|
24
|
+
s = "www.example.com/mail/?ui=2&shva=1#inbox"
|
25
|
+
assert_parses [[:url, s]], s
|
26
|
+
end
|
27
|
+
|
28
|
+
should "extract IDN url" do
|
29
|
+
s = "http://✪df.ws/ejp"
|
30
|
+
assert_parses [[:url, s]], s
|
31
|
+
end
|
32
|
+
|
33
|
+
should "not extract invalid domain" do
|
34
|
+
s = "http://example_com/mail/?ui=2&shva=1#inbox"
|
35
|
+
assert_parses [[:text, s]], s
|
36
|
+
end
|
37
|
+
|
27
38
|
should "extract hashtag" do
|
28
39
|
s = "#HashTag2010"
|
29
40
|
assert_parses [[:hashtag, s]], s
|
@@ -31,7 +42,7 @@ class ParserTest < Test::Unit::TestCase
|
|
31
42
|
|
32
43
|
should "extract at-references" do
|
33
44
|
s = "@AtRef_3000"
|
34
|
-
assert_parses [[:
|
45
|
+
assert_parses [[:username, s]], s
|
35
46
|
end
|
36
47
|
|
37
48
|
should "extract HTML" do
|
@@ -39,9 +50,19 @@ class ParserTest < Test::Unit::TestCase
|
|
39
50
|
assert_parses [[:html, s]], s
|
40
51
|
end
|
41
52
|
|
53
|
+
should "extract a slash comment" do
|
54
|
+
s = %{/via}
|
55
|
+
assert_parses [[:slash, s]], s
|
56
|
+
end
|
57
|
+
|
58
|
+
should "extract a list" do
|
59
|
+
s = %{@username/list}
|
60
|
+
assert_parses [[:list, s]], s
|
61
|
+
end
|
62
|
+
|
42
63
|
should "extract words spaces and new lines" do
|
43
64
|
s = "this string\nhas spaces!"
|
44
|
-
expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
|
65
|
+
expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline, "\n"],
|
45
66
|
[:text, "has"], [:space, " "], [:text, "spaces!"]]
|
46
67
|
assert_parses expected, s
|
47
68
|
end
|
@@ -51,7 +72,7 @@ class ParserTest < Test::Unit::TestCase
|
|
51
72
|
expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, " "],
|
52
73
|
[:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
|
53
74
|
[:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
|
54
|
-
[:html, "</a>"], [:newline],
|
75
|
+
[:html, "</a>"], [:newline, "\n"],
|
55
76
|
[:url, "http://twitpic.com/14vzny"],
|
56
77
|
[:space, " "], [:text, "3"], [:space, " "],
|
57
78
|
[:url, "http://twitpic.com/14vzny"]]
|
@@ -60,7 +81,7 @@ class ParserTest < Test::Unit::TestCase
|
|
60
81
|
|
61
82
|
should "extract elements from real-world sample" do
|
62
83
|
s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
|
63
|
-
expected = [[:text, "RT"], [:space, " "], [:
|
84
|
+
expected = [[:text, "RT"], [:space, " "], [:username, "@newsbrooke"], [:space, " "],
|
64
85
|
[:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
|
65
86
|
[:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
|
66
87
|
[:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],
|
@@ -0,0 +1,250 @@
|
|
1
|
+
|
2
|
+
tests:
|
3
|
+
usernames:
|
4
|
+
- description: "Autolink trailing username"
|
5
|
+
text: "text @username"
|
6
|
+
expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
|
7
|
+
|
8
|
+
- description: "Autolink username at the beginning"
|
9
|
+
text: "@username text"
|
10
|
+
expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a> text"
|
11
|
+
|
12
|
+
- description: "DO NOT Autolink username preceded by a letter"
|
13
|
+
text: "meet@the beach"
|
14
|
+
expected: "meet@the beach"
|
15
|
+
|
16
|
+
- description: "Autolink username preceded by puctuation"
|
17
|
+
text: "great.@username"
|
18
|
+
expected: "great.@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
|
19
|
+
|
20
|
+
- description: "Autolink username followed by puctuation"
|
21
|
+
text: "@username&^$%^"
|
22
|
+
expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>&^$%^"
|
23
|
+
|
24
|
+
- description: "Autolink username followed by Japanese"
|
25
|
+
text: "@usernameの"
|
26
|
+
expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
|
27
|
+
|
28
|
+
- description: "Autolink username preceded by Japanese"
|
29
|
+
text: "あ@username"
|
30
|
+
expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
|
31
|
+
|
32
|
+
- description: "Autolink username surrounded by Japanese"
|
33
|
+
text: "あ@usernameの"
|
34
|
+
expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
|
35
|
+
|
36
|
+
- description: "Autolink username with full-width at sign (U+FF20)"
|
37
|
+
text: "＠username"
|
38
|
+
expected: "＠<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
|
39
|
+
|
40
|
+
- description: "DO NOT Autolink username over 20 characters"
|
41
|
+
text: "@username9012345678901"
|
42
|
+
expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username901234567890\">username901234567890</a>1"
|
43
|
+
lists:
|
44
|
+
- description: "Autolink list preceded by a space"
|
45
|
+
text: "text @username/list"
|
46
|
+
expected: "text @<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
|
47
|
+
|
48
|
+
- description: "DO NOT Autolink list when space follows slash"
|
49
|
+
text: "text @username/ list"
|
50
|
+
expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>/ list"
|
51
|
+
|
52
|
+
- description: "DO NOT Autolink list with empty username"
|
53
|
+
text: "text @/list"
|
54
|
+
expected: "text @/list"
|
55
|
+
|
56
|
+
- description: "Autolink list at the beginning"
|
57
|
+
text: "@username/list"
|
58
|
+
expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
|
59
|
+
|
60
|
+
- description: "DO NOT Autolink list preceded by letter"
|
61
|
+
text: "meet@the/beach"
|
62
|
+
expected: "meet@the/beach"
|
63
|
+
|
64
|
+
- description: "Autolink list preceded by puctuation"
|
65
|
+
text: "great.@username/list"
|
66
|
+
expected: "great.@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
|
67
|
+
|
68
|
+
- description: "Autolink list followed by puctuation"
|
69
|
+
text: "@username/list&^$%^"
|
70
|
+
expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>&^$%^"
|
71
|
+
|
72
|
+
- description: "Autolink list name over 80 characters (truncated to 80)"
|
73
|
+
text: "@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A"
|
74
|
+
expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list5678901234567890123456789012345678901234567890123456789012345678901234567890\">username/list5678901234567890123456789012345678901234567890123456789012345678901234567890</a>A"
|
75
|
+
|
76
|
+
hashtags:
|
77
|
+
- description: "Autolink trailing hashtag"
|
78
|
+
text: "text #hashtag"
|
79
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
|
80
|
+
|
81
|
+
- description: "Autolink alphanumeric hashtag (letter-number-letter)"
|
82
|
+
text: "text #hash0tag"
|
83
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hash0tag\" title=\"#hash0tag\" class=\"tweet-url hashtag\">#hash0tag</a>"
|
84
|
+
|
85
|
+
- description: "Autolink alphanumeric hashtag (number-letter)"
|
86
|
+
text: "text #1tag"
|
87
|
+
expected: "text <a href=\"http://twitter.com/search?q=%231tag\" title=\"#1tag\" class=\"tweet-url hashtag\">#1tag</a>"
|
88
|
+
|
89
|
+
- description: "Autolink hashtag with underscore"
|
90
|
+
text: "text #hash_tag"
|
91
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hash_tag\" title=\"#hash_tag\" class=\"tweet-url hashtag\">#hash_tag</a>"
|
92
|
+
|
93
|
+
- description: "DO NOT Autolink all-numeric hashtags"
|
94
|
+
text: "text #1234"
|
95
|
+
expected: "text #1234"
|
96
|
+
|
97
|
+
- description: "DO NOT Autolink hashtag preceded by a letter"
|
98
|
+
text: "text#hashtag"
|
99
|
+
expected: "text#hashtag"
|
100
|
+
|
101
|
+
- description: "Autolink multiple hashtags"
|
102
|
+
text: "text #hashtag1 #hashtag2"
|
103
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hashtag1\" title=\"#hashtag1\" class=\"tweet-url hashtag\">#hashtag1</a> <a href=\"http://twitter.com/search?q=%23hashtag2\" title=\"#hashtag2\" class=\"tweet-url hashtag\">#hashtag2</a>"
|
104
|
+
|
105
|
+
- description: "Autolink hashtag preceded by a period"
|
106
|
+
text: "text.#hashtag"
|
107
|
+
expected: "text.<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
|
108
|
+
|
109
|
+
- description: "DO NOT Autolink hashtag preceded by &"
|
110
|
+
text: "&#nbsp;"
|
111
|
+
expected: "&#nbsp;"
|
112
|
+
|
113
|
+
- description: "Autolink hashtag followed by ! (! not included)"
|
114
|
+
text: "text #hashtag!"
|
115
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>!"
|
116
|
+
|
117
|
+
- description: "Autolink hashtag followed by Japanese"
|
118
|
+
text: "text #hashtagの"
|
119
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>の"
|
120
|
+
|
121
|
+
- description: "Autolink hashtag preceded by full-width space (U+3000)"
|
122
|
+
text: "text　#hashtag"
|
123
|
+
expected: "text　<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
|
124
|
+
|
125
|
+
- description: "Autolink hashtag followed by full-width space (U+3000)"
|
126
|
+
text: "#hashtag　text"
|
127
|
+
expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>　text"
|
128
|
+
|
129
|
+
- description: "Autolink hashtag with full-width hash (U+FF03)"
|
130
|
+
text: "＃hashtag"
|
131
|
+
expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">＃hashtag</a>"
|
132
|
+
|
133
|
+
urls:
|
134
|
+
- description: "Autolink trailing url"
|
135
|
+
text: "text http://example.com"
|
136
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>"
|
137
|
+
|
138
|
+
- description: "Autolink url in mid-text"
|
139
|
+
text: "text http://example.com more text"
|
140
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a> more text"
|
141
|
+
|
142
|
+
- description: "Autolink url in Japanese text"
|
143
|
+
text: "いまなにしてるhttp://example.comいまなにしてる"
|
144
|
+
expected: "いまなにしてる<a href=\"http://example.com\">http://example.com</a>いまなにしてる"
|
145
|
+
|
146
|
+
- description: "Autolink url surrounded by parentheses"
|
147
|
+
text: "text (http://example.com)"
|
148
|
+
expected: "text (<a href=\"http://example.com\">http://example.com</a>)"
|
149
|
+
|
150
|
+
- description: "Autolink url containing unicode characters"
|
151
|
+
text: "I enjoy Macintosh Brand computers: http://✪df.ws/ejp"
|
152
|
+
expected: "I enjoy Macintosh Brand computers: <a href=\"http://✪df.ws/ejp\">http://✪df.ws/ejp</a>"
|
153
|
+
|
154
|
+
- description: "DO NOT Autolink url containing ! character in the domain"
|
155
|
+
text: "badly formatted http://foo!bar.com"
|
156
|
+
expected: "badly formatted http://foo!bar.com"
|
157
|
+
|
158
|
+
- description: "DO NOT Autolink url containing _ character in the domain"
|
159
|
+
text: "badly formatted http://foo_bar.com"
|
160
|
+
expected: "badly formatted http://foo_bar.com"
|
161
|
+
|
162
|
+
- description: "Autolink url preceded by :"
|
163
|
+
text: "text:http://example.com"
|
164
|
+
expected: "text:<a href=\"http://example.com\">http://example.com</a>"
|
165
|
+
|
166
|
+
- description: "Autolink url followed by ? (without it)"
|
167
|
+
text: "text http://example.com?"
|
168
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>?"
|
169
|
+
|
170
|
+
- description: "Autolink url followed by ! (without it)"
|
171
|
+
text: "text http://example.com!"
|
172
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>!"
|
173
|
+
|
174
|
+
- description: "Autolink url followed by , (without it)"
|
175
|
+
text: "text http://example.com,"
|
176
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>,"
|
177
|
+
|
178
|
+
- description: "Autolink url followed by . (without it)"
|
179
|
+
text: "text http://example.com."
|
180
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>."
|
181
|
+
|
182
|
+
- description: "Autolink url followed by : (without it)"
|
183
|
+
text: "text http://example.com:"
|
184
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>:"
|
185
|
+
|
186
|
+
- description: "Autolink url followed by ; (without it)"
|
187
|
+
text: "text http://example.com;"
|
188
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>;"
|
189
|
+
|
190
|
+
- description: "Autolink url followed by ] (without it)"
|
191
|
+
text: "text http://example.com]"
|
192
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>]"
|
193
|
+
|
194
|
+
- description: "Autolink url followed by ) (without it)"
|
195
|
+
text: "text http://example.com)"
|
196
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>)"
|
197
|
+
|
198
|
+
- description: "Autolink url followed by } (without it)"
|
199
|
+
text: "text http://example.com}"
|
200
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>}"
|
201
|
+
|
202
|
+
- description: "Autolink url followed by = (without it)"
|
203
|
+
text: "text http://example.com="
|
204
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>="
|
205
|
+
|
206
|
+
- description: "Autolink url followed by ' (without it)"
|
207
|
+
text: "text http://example.com'"
|
208
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>'"
|
209
|
+
|
210
|
+
- description: "DO NOT Autolink url preceded by '"
|
211
|
+
text: "text 'http://example.com"
|
212
|
+
expected: "text 'http://example.com"
|
213
|
+
|
214
|
+
- description: "DO NOT Autolink url preceded by /"
|
215
|
+
text: "text /http://example.com"
|
216
|
+
expected: "text /http://example.com"
|
217
|
+
|
218
|
+
- description: "DO NOT Autolink url preceded by !"
|
219
|
+
text: "text !http://example.com"
|
220
|
+
expected: "text !http://example.com"
|
221
|
+
|
222
|
+
- description: "DO NOT Autolink url preceded by ="
|
223
|
+
text: "text =http://example.com"
|
224
|
+
expected: "text =http://example.com"
|
225
|
+
|
226
|
+
- description: "Autolink url embedded in link tag"
|
227
|
+
text: "<link rel='true'>http://example.com</link>"
|
228
|
+
expected: "<link rel='true'><a href=\"http://example.com\">http://example.com</a></link>"
|
229
|
+
|
230
|
+
- description: "Autolink multiple urls"
|
231
|
+
text: "http://example.com https://sslexample.com http://sub.example.com"
|
232
|
+
expected: "<a href=\"http://example.com\">http://example.com</a> <a href=\"https://sslexample.com\">https://sslexample.com</a> <a href=\"http://sub.example.com\">http://sub.example.com</a>"
|
233
|
+
|
234
|
+
- description: "Autolink url with long TLD"
|
235
|
+
text: "http://example.mobi/path"
|
236
|
+
expected: "<a href=\"http://example.mobi/path\">http://example.mobi/path</a>"
|
237
|
+
|
238
|
+
- description: "Autolink url without protocol (with www)"
|
239
|
+
text: "www.example.com"
|
240
|
+
expected: "<a href=\"http://www.example.com\">www.example.com</a>"
|
241
|
+
|
242
|
+
- description: "Autolink url without protocol (with WWW)"
|
243
|
+
text: "WWW.EXAMPLE.COM"
|
244
|
+
expected: "<a href=\"http://WWW.EXAMPLE.COM\">WWW.EXAMPLE.COM</a>"
|
245
|
+
|
246
|
+
all:
|
247
|
+
- description: "Autolink url does not overlap @username"
|
248
|
+
text: "Check out: http://example.com/test&@chasesechrist"
|
249
|
+
expected: "Check out: <a href=\"http://example.com/test&\">http://example.com/test&</a>@<a class=\"tweet-url username\" href=\"http://twitter.com/chasesechrist\">chasesechrist</a>"
|
250
|
+
|
@@ -0,0 +1,193 @@
|
|
1
|
+
|
2
|
+
tests:
|
3
|
+
mentions:
|
4
|
+
- description: "Extract mention at the begining of a tweet"
|
5
|
+
text: "@username reply"
|
6
|
+
expected: ["username"]
|
7
|
+
|
8
|
+
- description: "Extract mention at the end of a tweet"
|
9
|
+
text: "mention @username"
|
10
|
+
expected: ["username"]
|
11
|
+
|
12
|
+
- description: "Extract mention in the middle of a tweet"
|
13
|
+
text: "mention @username in the middle"
|
14
|
+
expected: ["username"]
|
15
|
+
|
16
|
+
- description: "Extract mention of username with underscore"
|
17
|
+
text: "mention @user_name"
|
18
|
+
expected: ["user_name"]
|
19
|
+
|
20
|
+
- description: "Extract mention of all numeric username"
|
21
|
+
text: "mention @12345"
|
22
|
+
expected: ["12345"]
|
23
|
+
|
24
|
+
- description: "Extract mention or multiple usernames"
|
25
|
+
text: "mention @username1 @username2"
|
26
|
+
expected: ["username1", "username2"]
|
27
|
+
|
28
|
+
- description: "Extract mention in the middle of a Japanese tweet"
|
29
|
+
text: "の@usernameに到着を待っている"
|
30
|
+
expected: ["username"]
|
31
|
+
|
32
|
+
- description: "DO NOT extract username ending in @"
|
33
|
+
text: "Current Status: @_@ (cc: @username)"
|
34
|
+
expected: ["username"]
|
35
|
+
|
36
|
+
- description: "Extract lone metion but not @user@user (too close to an email)"
|
37
|
+
text: "@username email me @test@example.com"
|
38
|
+
expected: ["username"]
|
39
|
+
|
40
|
+
replies:
|
41
|
+
- description: "Extract reply at the begining of a tweet"
|
42
|
+
text: "@username reply"
|
43
|
+
expected: "username"
|
44
|
+
|
45
|
+
- description: "Extract reply preceded by only a space"
|
46
|
+
text: " @username reply"
|
47
|
+
expected: "username"
|
48
|
+
|
49
|
+
- description: "Extract reply preceded by only a full-width space (U+3000)"
|
50
|
+
text: "　@username reply"
|
51
|
+
expected: "username"
|
52
|
+
|
53
|
+
- description: "DO NOT Extract reply when preceded by text"
|
54
|
+
text: "a @username mention, not a reply"
|
55
|
+
expected:
|
56
|
+
|
57
|
+
- description: "DO NOT Extract reply when preceded by ."
|
58
|
+
text: ".@username mention, not a reply"
|
59
|
+
expected:
|
60
|
+
|
61
|
+
- description: "DO NOT Extract reply when preceded by /"
|
62
|
+
text: "/@username mention, not a reply"
|
63
|
+
expected:
|
64
|
+
|
65
|
+
- description: "DO NOT Extract reply when preceded by _"
|
66
|
+
text: "_@username mention, not a reply"
|
67
|
+
expected:
|
68
|
+
|
69
|
+
- description: "DO NOT Extract reply when preceded by -"
|
70
|
+
text: "-@username mention, not a reply"
|
71
|
+
expected:
|
72
|
+
|
73
|
+
- description: "DO NOT Extract reply when preceded by +"
|
74
|
+
text: "+@username mention, not a reply"
|
75
|
+
expected:
|
76
|
+
|
77
|
+
- description: "DO NOT Extract reply when preceded by #"
|
78
|
+
text: "#@username mention, not a reply"
|
79
|
+
expected:
|
80
|
+
|
81
|
+
- description: "DO NOT Extract reply when preceded by !"
|
82
|
+
text: "!@username mention, not a reply"
|
83
|
+
expected:
|
84
|
+
|
85
|
+
- description: "DO NOT Extract reply when preceded by @"
|
86
|
+
text: "@@username mention, not a reply"
|
87
|
+
expected:
|
88
|
+
|
89
|
+
urls:
|
90
|
+
- description: "Extract a lone URL"
|
91
|
+
text: "http://example.com"
|
92
|
+
expected: ["http://example.com"]
|
93
|
+
|
94
|
+
- description: "Extract valid URL: http://google.com"
|
95
|
+
text: "text http://google.com"
|
96
|
+
expected: ["http://google.com"]
|
97
|
+
|
98
|
+
- description: "Extract valid URL: http://foobar.com/#"
|
99
|
+
text: "text http://foobar.com/#"
|
100
|
+
expected: ["http://foobar.com/#"]
|
101
|
+
|
102
|
+
- description: "Extract valid URL: http://google.com/#foo"
|
103
|
+
text: "text http://google.com/#foo"
|
104
|
+
expected: ["http://google.com/#foo"]
|
105
|
+
|
106
|
+
- description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks"
|
107
|
+
text: "text http://google.com/#search?q=iphone%20-filter%3Alinks"
|
108
|
+
expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"]
|
109
|
+
|
110
|
+
- description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks"
|
111
|
+
text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks"
|
112
|
+
expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"]
|
113
|
+
|
114
|
+
- description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
|
115
|
+
text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
|
116
|
+
expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"]
|
117
|
+
|
118
|
+
- description: "Extract valid URL: http://somehost.com:3000"
|
119
|
+
text: "text http://somehost.com:3000"
|
120
|
+
expected: ["http://somehost.com:3000"]
|
121
|
+
|
122
|
+
- description: "Extract valid URL: http://x.com/~matthew+%-x"
|
123
|
+
text: "text http://x.com/~matthew+%-x"
|
124
|
+
expected: ["http://x.com/~matthew+%-x"]
|
125
|
+
|
126
|
+
- description: "Extract valid URL: http://x.com/~matthew+%-,.;x"
|
127
|
+
text: "text http://x.com/~matthew+%-,.;x"
|
128
|
+
expected: ["http://x.com/~matthew+%-,.;x"]
|
129
|
+
|
130
|
+
- description: "Extract valid URL: http://x.com/,.;x"
|
131
|
+
text: "text http://x.com/,.;x"
|
132
|
+
expected: ["http://x.com/,.;x"]
|
133
|
+
|
134
|
+
- description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)"
|
135
|
+
text: "text http://en.wikipedia.org/wiki/Primer_(film)"
|
136
|
+
expected: ["http://en.wikipedia.org/wiki/Primer_(film)"]
|
137
|
+
|
138
|
+
- description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59"
|
139
|
+
text: "text http://www.ams.org/bookstore-getitem/item=mbk-59"
|
140
|
+
expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"]
|
141
|
+
|
142
|
+
- description: "Extract valid URL: http://✪df.ws/ejp"
|
143
|
+
text: "text http://✪df.ws/ejp"
|
144
|
+
expected: ["http://✪df.ws/ejp"]
|
145
|
+
|
146
|
+
- description: "Extract valid URL: http://chilp.it/?77e8fd"
|
147
|
+
text: "text http://chilp.it/?77e8fd"
|
148
|
+
expected: ["http://chilp.it/?77e8fd"]
|
149
|
+
|
150
|
+
- description: "DO NOT extract invalid URL: http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
|
151
|
+
text: "text http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
|
152
|
+
expected: []
|
153
|
+
|
154
|
+
- description: "DO NOT extract invalid URL: http://no-tld"
|
155
|
+
text: "text http://no-tld"
|
156
|
+
expected: []
|
157
|
+
|
158
|
+
- description: "DO NOT extract invalid URL: http://tld-too-short.x"
|
159
|
+
text: "text http://tld-too-short.x"
|
160
|
+
expected: []
|
161
|
+
|
162
|
+
hashtags:
|
163
|
+
- description: "Extract an all-alpha hashtag"
|
164
|
+
text: "a #hashtag here"
|
165
|
+
expected: ["hashtag"]
|
166
|
+
|
167
|
+
- description: "Extract a letter-then-number hashtag"
|
168
|
+
text: "this is #hashtag1"
|
169
|
+
expected: ["hashtag1"]
|
170
|
+
|
171
|
+
- description: "Extract a number-then-letter hashtag"
|
172
|
+
text: "#1hashtag is this"
|
173
|
+
expected: ["1hashtag"]
|
174
|
+
|
175
|
+
- description: "DO NOT Extract an all-numeric hashtag"
|
176
|
+
text: "On the #16 bus"
|
177
|
+
expected: []
|
178
|
+
|
179
|
+
- description: "Extract a hashtag containing ñ"
|
180
|
+
text: "I'll write more tests #mañana"
|
181
|
+
expected: ["mañana"]
|
182
|
+
|
183
|
+
- description: "Extract a hashtag containing é"
|
184
|
+
text: "Working remotely #café"
|
185
|
+
expected: ["café"]
|
186
|
+
|
187
|
+
- description: "Extract a hashtag containing ü"
|
188
|
+
text: "Getting my Oktoberfest on #münchen"
|
189
|
+
expected: ["münchen"]
|
190
|
+
|
191
|
+
- description: "DO NOT Extract a hashtag containing Japanese"
|
192
|
+
text: "this is not valid: # 会議中 ハッシュ"
|
193
|
+
expected: []
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tweetparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Battley
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-02-
|
12
|
+
date: 2010-02-24 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -51,7 +51,11 @@ extensions: []
|
|
51
51
|
extra_rdoc_files: []
|
52
52
|
|
53
53
|
files:
|
54
|
+
- test/conformance_test.rb
|
54
55
|
- test/parser_test.rb
|
56
|
+
- test/twitter-text-conformance/autolink.yml
|
57
|
+
- test/twitter-text-conformance/extract.yml
|
58
|
+
- test/twitter-text-conformance/README
|
55
59
|
- lib/tweetparser/grammar.treetop
|
56
60
|
- lib/tweetparser.rb
|
57
61
|
has_rdoc: true
|