tweetparser 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/tweetparser.rb +12 -0
- data/lib/tweetparser/grammar.treetop +37 -9
- data/test/conformance_test.rb +48 -0
- data/test/parser_test.rb +30 -9
- data/test/twitter-text-conformance/README +6 -0
- data/test/twitter-text-conformance/autolink.yml +250 -0
- data/test/twitter-text-conformance/extract.yml +193 -0
- metadata +6 -2
data/lib/tweetparser.rb
CHANGED
@@ -1,3 +1,15 @@
|
|
1
1
|
require "treetop"
|
2
2
|
require "polyglot"
|
3
3
|
require "tweetparser/grammar"
|
4
|
+
|
5
|
+
module TweetParser
|
6
|
+
def self.parse(input)
|
7
|
+
kcode = $KCODE
|
8
|
+
$KCODE = "n"
|
9
|
+
parser = TweetContentParser.new
|
10
|
+
parsed = parser.parse(input)
|
11
|
+
$KCODE = kcode
|
12
|
+
return nil unless parsed
|
13
|
+
parsed.content
|
14
|
+
end
|
15
|
+
end
|
@@ -1,38 +1,66 @@
|
|
1
1
|
grammar TweetContent
|
2
2
|
rule tweet
|
3
|
-
(url / html / space / newline /
|
3
|
+
(url / html / space / newline / list / username / hashtag / slash / text)* {
|
4
4
|
def content
|
5
5
|
elements.map{ |e| e.content }
|
6
6
|
end
|
7
7
|
}
|
8
8
|
end
|
9
9
|
|
10
|
+
rule subdomain
|
11
|
+
([a-zA-Z0-9\-] / [^\x20-\x7F])+
|
12
|
+
end
|
13
|
+
|
10
14
|
rule url
|
11
|
-
"http" "
|
15
|
+
(("http" / "HTTP") [sS]? "://" / "www." / "WWW.") subdomain ("." subdomain)+ ("/" [\.a-zA-Z0-9\?#=\-_&%]*)* {
|
12
16
|
def content
|
13
17
|
[:url, text_value]
|
14
18
|
end
|
15
19
|
}
|
16
20
|
end
|
17
21
|
|
18
|
-
rule
|
19
|
-
|
22
|
+
rule name
|
23
|
+
[a-zA-Z0-9_]+
|
24
|
+
end
|
25
|
+
|
26
|
+
rule name_with_letters
|
27
|
+
[a-zA-Z] [a-zA-Z0-9_]* / [0-9_]+ [a-zA-Z] [a-zA-Z0-9_]*
|
28
|
+
end
|
29
|
+
|
30
|
+
rule username
|
31
|
+
("@" / "＠") name {
|
32
|
+
def content
|
33
|
+
[:username, text_value]
|
34
|
+
end
|
35
|
+
}
|
36
|
+
end
|
37
|
+
|
38
|
+
rule list
|
39
|
+
username "/" name {
|
20
40
|
def content
|
21
|
-
[:
|
41
|
+
[:list, text_value]
|
22
42
|
end
|
23
43
|
}
|
24
44
|
end
|
25
45
|
|
26
46
|
rule hashtag
|
27
|
-
"#"
|
47
|
+
("#" / "＃") name_with_letters {
|
28
48
|
def content
|
29
49
|
[:hashtag, text_value]
|
30
50
|
end
|
31
51
|
}
|
32
52
|
end
|
33
53
|
|
54
|
+
rule slash
|
55
|
+
"/" name {
|
56
|
+
def content
|
57
|
+
[:slash, text_value]
|
58
|
+
end
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
34
62
|
rule text
|
35
|
-
|
63
|
+
[\S]+ {
|
36
64
|
def content
|
37
65
|
[:text, text_value]
|
38
66
|
end
|
@@ -50,13 +78,13 @@ grammar TweetContent
|
|
50
78
|
rule newline
|
51
79
|
"\r"? "\n" {
|
52
80
|
def content
|
53
|
-
[:newline]
|
81
|
+
[:newline, text_value]
|
54
82
|
end
|
55
83
|
}
|
56
84
|
end
|
57
85
|
|
58
86
|
rule space
|
59
|
-
" "+ {
|
87
|
+
(" " / "　")+ {
|
60
88
|
def content
|
61
89
|
[:space, text_value]
|
62
90
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
$:.unshift(File.expand_path("../../lib", __FILE__))
|
3
|
+
require "test/unit"
|
4
|
+
require "shoulda"
|
5
|
+
require "tweetparser"
|
6
|
+
require "yaml"
|
7
|
+
require "cgi"
|
8
|
+
|
9
|
+
class AutolinkConformanceTest < Test::Unit::TestCase
|
10
|
+
DATA_PATH = File.expand_path("../twitter-text-conformance/autolink.yml", __FILE__)
|
11
|
+
|
12
|
+
def assert_autolink(expected, input)
|
13
|
+
sexpr = TweetParser.parse(input)
|
14
|
+
assert sexpr, "Failed to parse #{input}"
|
15
|
+
actual = sexpr.inject(""){ |output, (type, value)|
|
16
|
+
output <<
|
17
|
+
case type
|
18
|
+
when :username
|
19
|
+
at, username = value.scan(/^.|.*/)
|
20
|
+
%{#{at}<a class="tweet-url username" href="http://twitter.com/#{username}">#{username}</a>}
|
21
|
+
when :list
|
22
|
+
at, list = value.scan(/^.|.*/)
|
23
|
+
%{#{at}<a class="tweet-url list-slug" href=\"http://twitter.com/#{list}">#{list}</a>}
|
24
|
+
when :hashtag
|
25
|
+
hash, hashtag = value.scan(/^.|.*/)
|
26
|
+
%{<a href="http://twitter.com/search?q=%23#{hashtag}" }+
|
27
|
+
%{title="\##{hashtag}" class="tweet-url hashtag">#{value}</a>}
|
28
|
+
when :url
|
29
|
+
href = value
|
30
|
+
href = "http://" + value unless href =~ /^http/i
|
31
|
+
%{<a href="#{href}">#{value}</a>}
|
32
|
+
else
|
33
|
+
value
|
34
|
+
end
|
35
|
+
}
|
36
|
+
assert_equal expected, actual, sexpr.inspect
|
37
|
+
end
|
38
|
+
|
39
|
+
YAML.load(File.read(DATA_PATH))["tests"].each do |section, tests|
|
40
|
+
context "when testing #{section}" do
|
41
|
+
tests.each do |hash|
|
42
|
+
should hash["description"] do
|
43
|
+
assert_autolink hash["expected"], hash["text"]
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
data/test/parser_test.rb
CHANGED
@@ -6,12 +6,8 @@ require "tweetparser"
|
|
6
6
|
|
7
7
|
class ParserTest < Test::Unit::TestCase
|
8
8
|
|
9
|
-
def setup
|
10
|
-
@parser = TweetContentParser.new
|
11
|
-
end
|
12
|
-
|
13
9
|
def assert_parses(expected, input)
|
14
|
-
actual =
|
10
|
+
actual = TweetParser.parse(input)
|
15
11
|
assert_equal expected, actual
|
16
12
|
end
|
17
13
|
|
@@ -24,6 +20,21 @@ class ParserTest < Test::Unit::TestCase
|
|
24
20
|
assert_parses [[:url, s]], s
|
25
21
|
end
|
26
22
|
|
23
|
+
should "extract url with www and no http" do
|
24
|
+
s = "www.example.com/mail/?ui=2&shva=1#inbox"
|
25
|
+
assert_parses [[:url, s]], s
|
26
|
+
end
|
27
|
+
|
28
|
+
should "extract IDN url" do
|
29
|
+
s = "http://✪df.ws/ejp"
|
30
|
+
assert_parses [[:url, s]], s
|
31
|
+
end
|
32
|
+
|
33
|
+
should "not extract invalid domain" do
|
34
|
+
s = "http://example_com/mail/?ui=2&shva=1#inbox"
|
35
|
+
assert_parses [[:text, s]], s
|
36
|
+
end
|
37
|
+
|
27
38
|
should "extract hashtag" do
|
28
39
|
s = "#HashTag2010"
|
29
40
|
assert_parses [[:hashtag, s]], s
|
@@ -31,7 +42,7 @@ class ParserTest < Test::Unit::TestCase
|
|
31
42
|
|
32
43
|
should "extract at-references" do
|
33
44
|
s = "@AtRef_3000"
|
34
|
-
assert_parses [[:
|
45
|
+
assert_parses [[:username, s]], s
|
35
46
|
end
|
36
47
|
|
37
48
|
should "extract HTML" do
|
@@ -39,9 +50,19 @@ class ParserTest < Test::Unit::TestCase
|
|
39
50
|
assert_parses [[:html, s]], s
|
40
51
|
end
|
41
52
|
|
53
|
+
should "extract a slash comment" do
|
54
|
+
s = %{/via}
|
55
|
+
assert_parses [[:slash, s]], s
|
56
|
+
end
|
57
|
+
|
58
|
+
should "extract a list" do
|
59
|
+
s = %{@username/list}
|
60
|
+
assert_parses [[:list, s]], s
|
61
|
+
end
|
62
|
+
|
42
63
|
should "extract words spaces and new lines" do
|
43
64
|
s = "this string\nhas spaces!"
|
44
|
-
expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
|
65
|
+
expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline, "\n"],
|
45
66
|
[:text, "has"], [:space, " "], [:text, "spaces!"]]
|
46
67
|
assert_parses expected, s
|
47
68
|
end
|
@@ -51,7 +72,7 @@ class ParserTest < Test::Unit::TestCase
|
|
51
72
|
expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, " "],
|
52
73
|
[:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
|
53
74
|
[:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
|
54
|
-
[:html, "</a>"], [:newline],
|
75
|
+
[:html, "</a>"], [:newline, "\n"],
|
55
76
|
[:url, "http://twitpic.com/14vzny"],
|
56
77
|
[:space, " "], [:text, "3"], [:space, " "],
|
57
78
|
[:url, "http://twitpic.com/14vzny"]]
|
@@ -60,7 +81,7 @@ class ParserTest < Test::Unit::TestCase
|
|
60
81
|
|
61
82
|
should "extract elements from real-world sample" do
|
62
83
|
s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
|
63
|
-
expected = [[:text, "RT"], [:space, " "], [:
|
84
|
+
expected = [[:text, "RT"], [:space, " "], [:username, "@newsbrooke"], [:space, " "],
|
64
85
|
[:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
|
65
86
|
[:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
|
66
87
|
[:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],
|
@@ -0,0 +1,250 @@
|
|
1
|
+
|
2
|
+
tests:
|
3
|
+
usernames:
|
4
|
+
- description: "Autolink trailing username"
|
5
|
+
text: "text @username"
|
6
|
+
expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
|
7
|
+
|
8
|
+
- description: "Autolink username at the beginning"
|
9
|
+
text: "@username text"
|
10
|
+
expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a> text"
|
11
|
+
|
12
|
+
- description: "DO NOT Autolink username preceded by a letter"
|
13
|
+
text: "meet@the beach"
|
14
|
+
expected: "meet@the beach"
|
15
|
+
|
16
|
+
- description: "Autolink username preceded by puctuation"
|
17
|
+
text: "great.@username"
|
18
|
+
expected: "great.@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
|
19
|
+
|
20
|
+
- description: "Autolink username followed by puctuation"
|
21
|
+
text: "@username&^$%^"
|
22
|
+
expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>&^$%^"
|
23
|
+
|
24
|
+
- description: "Autolink username followed by Japanese"
|
25
|
+
text: "@usernameの"
|
26
|
+
expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
|
27
|
+
|
28
|
+
- description: "Autolink username preceded by Japanese"
|
29
|
+
text: "あ@username"
|
30
|
+
expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
|
31
|
+
|
32
|
+
- description: "Autolink username surrounded by Japanese"
|
33
|
+
text: "あ@usernameの"
|
34
|
+
expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
|
35
|
+
|
36
|
+
- description: "Autolink username with full-width at sign (U+FF20)"
|
37
|
+
text: "＠username"
|
38
|
+
expected: "＠<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
|
39
|
+
|
40
|
+
- description: "DO NOT Autolink username over 20 characters"
|
41
|
+
text: "@username9012345678901"
|
42
|
+
expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username901234567890\">username901234567890</a>1"
|
43
|
+
lists:
|
44
|
+
- description: "Autolink list preceded by a space"
|
45
|
+
text: "text @username/list"
|
46
|
+
expected: "text @<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
|
47
|
+
|
48
|
+
- description: "DO NOT Autolink list when space follows slash"
|
49
|
+
text: "text @username/ list"
|
50
|
+
expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>/ list"
|
51
|
+
|
52
|
+
- description: "DO NOT Autolink list with empty username"
|
53
|
+
text: "text @/list"
|
54
|
+
expected: "text @/list"
|
55
|
+
|
56
|
+
- description: "Autolink list at the beginning"
|
57
|
+
text: "@username/list"
|
58
|
+
expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
|
59
|
+
|
60
|
+
- description: "DO NOT Autolink list preceded by letter"
|
61
|
+
text: "meet@the/beach"
|
62
|
+
expected: "meet@the/beach"
|
63
|
+
|
64
|
+
- description: "Autolink list preceded by puctuation"
|
65
|
+
text: "great.@username/list"
|
66
|
+
expected: "great.@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
|
67
|
+
|
68
|
+
- description: "Autolink list followed by puctuation"
|
69
|
+
text: "@username/list&^$%^"
|
70
|
+
expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>&^$%^"
|
71
|
+
|
72
|
+
- description: "Autolink list name over 80 characters (truncated to 80)"
|
73
|
+
text: "@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A"
|
74
|
+
expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list5678901234567890123456789012345678901234567890123456789012345678901234567890\">username/list5678901234567890123456789012345678901234567890123456789012345678901234567890</a>A"
|
75
|
+
|
76
|
+
hashtags:
|
77
|
+
- description: "Autolink trailing hashtag"
|
78
|
+
text: "text #hashtag"
|
79
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
|
80
|
+
|
81
|
+
- description: "Autolink alphanumeric hashtag (letter-number-letter)"
|
82
|
+
text: "text #hash0tag"
|
83
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hash0tag\" title=\"#hash0tag\" class=\"tweet-url hashtag\">#hash0tag</a>"
|
84
|
+
|
85
|
+
- description: "Autolink alphanumeric hashtag (number-letter)"
|
86
|
+
text: "text #1tag"
|
87
|
+
expected: "text <a href=\"http://twitter.com/search?q=%231tag\" title=\"#1tag\" class=\"tweet-url hashtag\">#1tag</a>"
|
88
|
+
|
89
|
+
- description: "Autolink hashtag with underscore"
|
90
|
+
text: "text #hash_tag"
|
91
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hash_tag\" title=\"#hash_tag\" class=\"tweet-url hashtag\">#hash_tag</a>"
|
92
|
+
|
93
|
+
- description: "DO NOT Autolink all-numeric hashtags"
|
94
|
+
text: "text #1234"
|
95
|
+
expected: "text #1234"
|
96
|
+
|
97
|
+
- description: "DO NOT Autolink hashtag preceded by a letter"
|
98
|
+
text: "text#hashtag"
|
99
|
+
expected: "text#hashtag"
|
100
|
+
|
101
|
+
- description: "Autolink multiple hashtags"
|
102
|
+
text: "text #hashtag1 #hashtag2"
|
103
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hashtag1\" title=\"#hashtag1\" class=\"tweet-url hashtag\">#hashtag1</a> <a href=\"http://twitter.com/search?q=%23hashtag2\" title=\"#hashtag2\" class=\"tweet-url hashtag\">#hashtag2</a>"
|
104
|
+
|
105
|
+
- description: "Autolink hashtag preceded by a period"
|
106
|
+
text: "text.#hashtag"
|
107
|
+
expected: "text.<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
|
108
|
+
|
109
|
+
- description: "DO NOT Autolink hashtag preceded by &"
|
110
|
+
text: "&#nbsp;"
|
111
|
+
expected: "&#nbsp;"
|
112
|
+
|
113
|
+
- description: "Autolink hashtag followed by ! (! not included)"
|
114
|
+
text: "text #hashtag!"
|
115
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>!"
|
116
|
+
|
117
|
+
- description: "Autolink hashtag followed by Japanese"
|
118
|
+
text: "text #hashtagの"
|
119
|
+
expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>の"
|
120
|
+
|
121
|
+
- description: "Autolink hashtag preceded by full-width space (U+3000)"
|
122
|
+
text: "text　#hashtag"
|
123
|
+
expected: "text　<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
|
124
|
+
|
125
|
+
- description: "Autolink hashtag followed by full-width space (U+3000)"
|
126
|
+
text: "#hashtag　text"
|
127
|
+
expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>　text"
|
128
|
+
|
129
|
+
- description: "Autolink hashtag with full-width hash (U+FF03)"
|
130
|
+
text: "＃hashtag"
|
131
|
+
expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">＃hashtag</a>"
|
132
|
+
|
133
|
+
urls:
|
134
|
+
- description: "Autolink trailing url"
|
135
|
+
text: "text http://example.com"
|
136
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>"
|
137
|
+
|
138
|
+
- description: "Autolink url in mid-text"
|
139
|
+
text: "text http://example.com more text"
|
140
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a> more text"
|
141
|
+
|
142
|
+
- description: "Autolink url in Japanese text"
|
143
|
+
text: "いまなにしてるhttp://example.comいまなにしてる"
|
144
|
+
expected: "いまなにしてる<a href=\"http://example.com\">http://example.com</a>いまなにしてる"
|
145
|
+
|
146
|
+
- description: "Autolink url surrounded by parentheses"
|
147
|
+
text: "text (http://example.com)"
|
148
|
+
expected: "text (<a href=\"http://example.com\">http://example.com</a>)"
|
149
|
+
|
150
|
+
- description: "Autolink url containing unicode characters"
|
151
|
+
text: "I enjoy Macintosh Brand computers: http://✪df.ws/ejp"
|
152
|
+
expected: "I enjoy Macintosh Brand computers: <a href=\"http://✪df.ws/ejp\">http://✪df.ws/ejp</a>"
|
153
|
+
|
154
|
+
- description: "DO NOT Autolink url containing ! character in the domain"
|
155
|
+
text: "badly formatted http://foo!bar.com"
|
156
|
+
expected: "badly formatted http://foo!bar.com"
|
157
|
+
|
158
|
+
- description: "DO NOT Autolink url containing _ character in the domain"
|
159
|
+
text: "badly formatted http://foo_bar.com"
|
160
|
+
expected: "badly formatted http://foo_bar.com"
|
161
|
+
|
162
|
+
- description: "Autolink url preceded by :"
|
163
|
+
text: "text:http://example.com"
|
164
|
+
expected: "text:<a href=\"http://example.com\">http://example.com</a>"
|
165
|
+
|
166
|
+
- description: "Autolink url followed by ? (without it)"
|
167
|
+
text: "text http://example.com?"
|
168
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>?"
|
169
|
+
|
170
|
+
- description: "Autolink url followed by ! (without it)"
|
171
|
+
text: "text http://example.com!"
|
172
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>!"
|
173
|
+
|
174
|
+
- description: "Autolink url followed by , (without it)"
|
175
|
+
text: "text http://example.com,"
|
176
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>,"
|
177
|
+
|
178
|
+
- description: "Autolink url followed by . (without it)"
|
179
|
+
text: "text http://example.com."
|
180
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>."
|
181
|
+
|
182
|
+
- description: "Autolink url followed by : (without it)"
|
183
|
+
text: "text http://example.com:"
|
184
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>:"
|
185
|
+
|
186
|
+
- description: "Autolink url followed by ; (without it)"
|
187
|
+
text: "text http://example.com;"
|
188
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>;"
|
189
|
+
|
190
|
+
- description: "Autolink url followed by ] (without it)"
|
191
|
+
text: "text http://example.com]"
|
192
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>]"
|
193
|
+
|
194
|
+
- description: "Autolink url followed by ) (without it)"
|
195
|
+
text: "text http://example.com)"
|
196
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>)"
|
197
|
+
|
198
|
+
- description: "Autolink url followed by } (without it)"
|
199
|
+
text: "text http://example.com}"
|
200
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>}"
|
201
|
+
|
202
|
+
- description: "Autolink url followed by = (without it)"
|
203
|
+
text: "text http://example.com="
|
204
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>="
|
205
|
+
|
206
|
+
- description: "Autolink url followed by ' (without it)"
|
207
|
+
text: "text http://example.com'"
|
208
|
+
expected: "text <a href=\"http://example.com\">http://example.com</a>'"
|
209
|
+
|
210
|
+
- description: "DO NOT Autolink url preceded by '"
|
211
|
+
text: "text 'http://example.com"
|
212
|
+
expected: "text 'http://example.com"
|
213
|
+
|
214
|
+
- description: "DO NOT Autolink url preceded by /"
|
215
|
+
text: "text /http://example.com"
|
216
|
+
expected: "text /http://example.com"
|
217
|
+
|
218
|
+
- description: "DO NOT Autolink url preceded by !"
|
219
|
+
text: "text !http://example.com"
|
220
|
+
expected: "text !http://example.com"
|
221
|
+
|
222
|
+
- description: "DO NOT Autolink url preceded by ="
|
223
|
+
text: "text =http://example.com"
|
224
|
+
expected: "text =http://example.com"
|
225
|
+
|
226
|
+
- description: "Autolink url embedded in link tag"
|
227
|
+
text: "<link rel='true'>http://example.com</link>"
|
228
|
+
expected: "<link rel='true'><a href=\"http://example.com\">http://example.com</a></link>"
|
229
|
+
|
230
|
+
- description: "Autolink multiple urls"
|
231
|
+
text: "http://example.com https://sslexample.com http://sub.example.com"
|
232
|
+
expected: "<a href=\"http://example.com\">http://example.com</a> <a href=\"https://sslexample.com\">https://sslexample.com</a> <a href=\"http://sub.example.com\">http://sub.example.com</a>"
|
233
|
+
|
234
|
+
- description: "Autolink url with long TLD"
|
235
|
+
text: "http://example.mobi/path"
|
236
|
+
expected: "<a href=\"http://example.mobi/path\">http://example.mobi/path</a>"
|
237
|
+
|
238
|
+
- description: "Autolink url without protocol (with www)"
|
239
|
+
text: "www.example.com"
|
240
|
+
expected: "<a href=\"http://www.example.com\">www.example.com</a>"
|
241
|
+
|
242
|
+
- description: "Autolink url without protocol (with WWW)"
|
243
|
+
text: "WWW.EXAMPLE.COM"
|
244
|
+
expected: "<a href=\"http://WWW.EXAMPLE.COM\">WWW.EXAMPLE.COM</a>"
|
245
|
+
|
246
|
+
all:
|
247
|
+
- description: "Autolink url does not overlap @username"
|
248
|
+
text: "Check out: http://example.com/test&@chasesechrist"
|
249
|
+
expected: "Check out: <a href=\"http://example.com/test&\">http://example.com/test&</a>@<a class=\"tweet-url username\" href=\"http://twitter.com/chasesechrist\">chasesechrist</a>"
|
250
|
+
|
@@ -0,0 +1,193 @@
|
|
1
|
+
|
2
|
+
tests:
|
3
|
+
mentions:
|
4
|
+
- description: "Extract mention at the begining of a tweet"
|
5
|
+
text: "@username reply"
|
6
|
+
expected: ["username"]
|
7
|
+
|
8
|
+
- description: "Extract mention at the end of a tweet"
|
9
|
+
text: "mention @username"
|
10
|
+
expected: ["username"]
|
11
|
+
|
12
|
+
- description: "Extract mention in the middle of a tweet"
|
13
|
+
text: "mention @username in the middle"
|
14
|
+
expected: ["username"]
|
15
|
+
|
16
|
+
- description: "Extract mention of username with underscore"
|
17
|
+
text: "mention @user_name"
|
18
|
+
expected: ["user_name"]
|
19
|
+
|
20
|
+
- description: "Extract mention of all numeric username"
|
21
|
+
text: "mention @12345"
|
22
|
+
expected: ["12345"]
|
23
|
+
|
24
|
+
- description: "Extract mention or multiple usernames"
|
25
|
+
text: "mention @username1 @username2"
|
26
|
+
expected: ["username1", "username2"]
|
27
|
+
|
28
|
+
- description: "Extract mention in the middle of a Japanese tweet"
|
29
|
+
text: "の@usernameに到着を待っている"
|
30
|
+
expected: ["username"]
|
31
|
+
|
32
|
+
- description: "DO NOT extract username ending in @"
|
33
|
+
text: "Current Status: @_@ (cc: @username)"
|
34
|
+
expected: ["username"]
|
35
|
+
|
36
|
+
- description: "Extract lone metion but not @user@user (too close to an email)"
|
37
|
+
text: "@username email me @test@example.com"
|
38
|
+
expected: ["username"]
|
39
|
+
|
40
|
+
replies:
|
41
|
+
- description: "Extract reply at the begining of a tweet"
|
42
|
+
text: "@username reply"
|
43
|
+
expected: "username"
|
44
|
+
|
45
|
+
- description: "Extract reply preceded by only a space"
|
46
|
+
text: " @username reply"
|
47
|
+
expected: "username"
|
48
|
+
|
49
|
+
- description: "Extract reply preceded by only a full-width space (U+3000)"
|
50
|
+
text: "　@username reply"
|
51
|
+
expected: "username"
|
52
|
+
|
53
|
+
- description: "DO NOT Extract reply when preceded by text"
|
54
|
+
text: "a @username mention, not a reply"
|
55
|
+
expected:
|
56
|
+
|
57
|
+
- description: "DO NOT Extract reply when preceded by ."
|
58
|
+
text: ".@username mention, not a reply"
|
59
|
+
expected:
|
60
|
+
|
61
|
+
- description: "DO NOT Extract reply when preceded by /"
|
62
|
+
text: "/@username mention, not a reply"
|
63
|
+
expected:
|
64
|
+
|
65
|
+
- description: "DO NOT Extract reply when preceded by _"
|
66
|
+
text: "_@username mention, not a reply"
|
67
|
+
expected:
|
68
|
+
|
69
|
+
- description: "DO NOT Extract reply when preceded by -"
|
70
|
+
text: "-@username mention, not a reply"
|
71
|
+
expected:
|
72
|
+
|
73
|
+
- description: "DO NOT Extract reply when preceded by +"
|
74
|
+
text: "+@username mention, not a reply"
|
75
|
+
expected:
|
76
|
+
|
77
|
+
- description: "DO NOT Extract reply when preceded by #"
|
78
|
+
text: "#@username mention, not a reply"
|
79
|
+
expected:
|
80
|
+
|
81
|
+
- description: "DO NOT Extract reply when preceded by !"
|
82
|
+
text: "!@username mention, not a reply"
|
83
|
+
expected:
|
84
|
+
|
85
|
+
- description: "DO NOT Extract reply when preceded by @"
|
86
|
+
text: "@@username mention, not a reply"
|
87
|
+
expected:
|
88
|
+
|
89
|
+
urls:
|
90
|
+
- description: "Extract a lone URL"
|
91
|
+
text: "http://example.com"
|
92
|
+
expected: ["http://example.com"]
|
93
|
+
|
94
|
+
- description: "Extract valid URL: http://google.com"
|
95
|
+
text: "text http://google.com"
|
96
|
+
expected: ["http://google.com"]
|
97
|
+
|
98
|
+
- description: "Extract valid URL: http://foobar.com/#"
|
99
|
+
text: "text http://foobar.com/#"
|
100
|
+
expected: ["http://foobar.com/#"]
|
101
|
+
|
102
|
+
- description: "Extract valid URL: http://google.com/#foo"
|
103
|
+
text: "text http://google.com/#foo"
|
104
|
+
expected: ["http://google.com/#foo"]
|
105
|
+
|
106
|
+
- description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks"
|
107
|
+
text: "text http://google.com/#search?q=iphone%20-filter%3Alinks"
|
108
|
+
expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"]
|
109
|
+
|
110
|
+
- description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks"
|
111
|
+
text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks"
|
112
|
+
expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"]
|
113
|
+
|
114
|
+
- description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
|
115
|
+
text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
|
116
|
+
expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"]
|
117
|
+
|
118
|
+
- description: "Extract valid URL: http://somehost.com:3000"
|
119
|
+
text: "text http://somehost.com:3000"
|
120
|
+
expected: ["http://somehost.com:3000"]
|
121
|
+
|
122
|
+
- description: "Extract valid URL: http://x.com/~matthew+%-x"
|
123
|
+
text: "text http://x.com/~matthew+%-x"
|
124
|
+
expected: ["http://x.com/~matthew+%-x"]
|
125
|
+
|
126
|
+
- description: "Extract valid URL: http://x.com/~matthew+%-,.;x"
|
127
|
+
text: "text http://x.com/~matthew+%-,.;x"
|
128
|
+
expected: ["http://x.com/~matthew+%-,.;x"]
|
129
|
+
|
130
|
+
- description: "Extract valid URL: http://x.com/,.;x"
|
131
|
+
text: "text http://x.com/,.;x"
|
132
|
+
expected: ["http://x.com/,.;x"]
|
133
|
+
|
134
|
+
- description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)"
|
135
|
+
text: "text http://en.wikipedia.org/wiki/Primer_(film)"
|
136
|
+
expected: ["http://en.wikipedia.org/wiki/Primer_(film)"]
|
137
|
+
|
138
|
+
- description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59"
|
139
|
+
text: "text http://www.ams.org/bookstore-getitem/item=mbk-59"
|
140
|
+
expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"]
|
141
|
+
|
142
|
+
- description: "Extract valid URL: http://✪df.ws/ejp"
|
143
|
+
text: "text http://✪df.ws/ejp"
|
144
|
+
expected: ["http://✪df.ws/ejp"]
|
145
|
+
|
146
|
+
- description: "Extract valid URL: http://chilp.it/?77e8fd"
|
147
|
+
text: "text http://chilp.it/?77e8fd"
|
148
|
+
expected: ["http://chilp.it/?77e8fd"]
|
149
|
+
|
150
|
+
- description: "DO NOT extract invalid URL: http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
|
151
|
+
text: "text http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
|
152
|
+
expected: []
|
153
|
+
|
154
|
+
- description: "DO NOT extract invalid URL: http://no-tld"
|
155
|
+
text: "text http://no-tld"
|
156
|
+
expected: []
|
157
|
+
|
158
|
+
- description: "DO NOT extract invalid URL: http://tld-too-short.x"
|
159
|
+
text: "text http://tld-too-short.x"
|
160
|
+
expected: []
|
161
|
+
|
162
|
+
hashtags:
|
163
|
+
- description: "Extract an all-alpha hashtag"
|
164
|
+
text: "a #hashtag here"
|
165
|
+
expected: ["hashtag"]
|
166
|
+
|
167
|
+
- description: "Extract a letter-then-number hashtag"
|
168
|
+
text: "this is #hashtag1"
|
169
|
+
expected: ["hashtag1"]
|
170
|
+
|
171
|
+
- description: "Extract a number-then-letter hashtag"
|
172
|
+
text: "#1hashtag is this"
|
173
|
+
expected: ["1hashtag"]
|
174
|
+
|
175
|
+
- description: "DO NOT Extract an all-numeric hashtag"
|
176
|
+
text: "On the #16 bus"
|
177
|
+
expected: []
|
178
|
+
|
179
|
+
- description: "Extract a hashtag containing ñ"
|
180
|
+
text: "I'll write more tests #mañana"
|
181
|
+
expected: ["mañana"]
|
182
|
+
|
183
|
+
- description: "Extract a hashtag containing é"
|
184
|
+
text: "Working remotely #café"
|
185
|
+
expected: ["café"]
|
186
|
+
|
187
|
+
- description: "Extract a hashtag containing ü"
|
188
|
+
text: "Getting my Oktoberfest on #münchen"
|
189
|
+
expected: ["münchen"]
|
190
|
+
|
191
|
+
- description: "DO NOT Extract a hashtag containing Japanese"
|
192
|
+
text: "this is not valid: # 会議中 ハッシュ"
|
193
|
+
expected: []
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tweetparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Paul Battley
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-02-
|
12
|
+
date: 2010-02-24 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -51,7 +51,11 @@ extensions: []
|
|
51
51
|
extra_rdoc_files: []
|
52
52
|
|
53
53
|
files:
|
54
|
+
- test/conformance_test.rb
|
54
55
|
- test/parser_test.rb
|
56
|
+
- test/twitter-text-conformance/autolink.yml
|
57
|
+
- test/twitter-text-conformance/extract.yml
|
58
|
+
- test/twitter-text-conformance/README
|
55
59
|
- lib/tweetparser/grammar.treetop
|
56
60
|
- lib/tweetparser.rb
|
57
61
|
has_rdoc: true
|