tweetparser 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,15 @@
1
1
  require "treetop"
2
2
  require "polyglot"
3
3
  require "tweetparser/grammar"
4
+
5
# Entry point for turning raw tweet text into a list of typed tokens.
module TweetParser
  # Parses +input+ with TweetContentParser and returns the parse tree's
  # +content+, or nil when the parser does not accept the input.
  #
  # $KCODE is set to "n" only for the duration of the parse. The previous
  # value is restored in an +ensure+ clause so that an exception raised by
  # the parser cannot leave this process-wide global altered for the caller.
  def self.parse(input)
    saved_kcode = $KCODE
    begin
      $KCODE = "n"
      parsed = TweetContentParser.new.parse(input)
    ensure
      # Runs on both normal completion and when the parser raises.
      $KCODE = saved_kcode
    end
    parsed ? parsed.content : nil
  end
end
@@ -1,38 +1,66 @@
1
1
  grammar TweetContent
2
2
  rule tweet
3
- (url / html / space / newline / atref / hashtag / text)* {
3
+ (url / html / space / newline / list / username / hashtag / slash / text)* {
4
4
  def content
5
5
  elements.map{ |e| e.content }
6
6
  end
7
7
  }
8
8
  end
9
9
 
10
+ rule subdomain
11
+ ([a-zA-Z0-9\-] / [^\x20-\x7F])+
12
+ end
13
+
10
14
  rule url
11
- "http" "s"? "://" [\./a-zA-Z0-9\?#=\-_&%]+ {
15
+ (("http" / "HTTP") [sS]? "://" / "www." / "WWW.") subdomain ("." subdomain)+ ("/" [\.a-zA-Z0-9\?#=\-_&%]*)* {
12
16
  def content
13
17
  [:url, text_value]
14
18
  end
15
19
  }
16
20
  end
17
21
 
18
- rule atref
19
- "@" [a-zA-Z0-9_]+ {
22
+ rule name
23
+ [a-zA-Z0-9_]+
24
+ end
25
+
26
+ rule name_with_letters
27
+ [a-zA-Z] [a-zA-Z0-9_]* / [0-9_]+ [a-zA-Z] [a-zA-Z0-9_]*
28
+ end
29
+
30
+ rule username
31
+ ("@" / "＠") name {
32
+ def content
33
+ [:username, text_value]
34
+ end
35
+ }
36
+ end
37
+
38
+ rule list
39
+ username "/" name {
20
40
  def content
21
- [:atref, text_value]
41
+ [:list, text_value]
22
42
  end
23
43
  }
24
44
  end
25
45
 
26
46
  rule hashtag
27
- "#" [a-zA-Z0-9_]+ {
47
+ ("#" / "＃") name_with_letters {
28
48
  def content
29
49
  [:hashtag, text_value]
30
50
  end
31
51
  }
32
52
  end
33
53
 
54
+ rule slash
55
+ "/" name {
56
+ def content
57
+ [:slash, text_value]
58
+ end
59
+ }
60
+ end
61
+
34
62
  rule text
35
- ([^h\s] / "h" !("ttp" "s"? "://"))+ {
63
+ [\S]+ {
36
64
  def content
37
65
  [:text, text_value]
38
66
  end
@@ -50,13 +78,13 @@ grammar TweetContent
50
78
  rule newline
51
79
  "\r"? "\n" {
52
80
  def content
53
- [:newline]
81
+ [:newline, text_value]
54
82
  end
55
83
  }
56
84
  end
57
85
 
58
86
  rule space
59
- " "+ {
87
+ (" " / "　")+ {
60
88
  def content
61
89
  [:space, text_value]
62
90
  end
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+ $:.unshift(File.expand_path("../../lib", __FILE__))
3
+ require "test/unit"
4
+ require "shoulda"
5
+ require "tweetparser"
6
+ require "yaml"
7
+ require "cgi"
8
+
9
# Runs the autolink section of the twitter-text conformance suite against
# TweetParser by rendering the parsed tokens back into autolinked HTML.
class AutolinkConformanceTest < Test::Unit::TestCase
  DATA_PATH = File.expand_path("../twitter-text-conformance/autolink.yml", __FILE__)

  # Parses +input+, renders every token as HTML and compares the joined
  # result with +expected+. The token list is used as the failure message.
  def assert_autolink(expected, input)
    sexpr = TweetParser.parse(input)
    assert sexpr, "Failed to parse #{input}"
    actual = ""
    sexpr.each do |type, value|
      actual << render_token(type, value)
    end
    assert_equal expected, actual, sexpr.inspect
  end

  # Renders one [type, value] token. Usernames, lists, hashtags and URLs
  # become links; every other token type is emitted as its raw value.
  def render_token(type, value)
    case type
    when :username
      # The scan splits the leading sigil character from the rest of the token.
      sigil, username = value.scan(/^.|.*/)
      %{#{sigil}<a class="tweet-url username" href="http://twitter.com/#{username}">#{username}</a>}
    when :list
      sigil, list = value.scan(/^.|.*/)
      %{#{sigil}<a class="tweet-url list-slug" href=\"http://twitter.com/#{list}">#{list}</a>}
    when :hashtag
      # The title always uses an ASCII "#"; the link text keeps the original token.
      _sigil, hashtag = value.scan(/^.|.*/)
      %{<a href="http://twitter.com/search?q=%23#{hashtag}" } +
        %{title="\##{hashtag}" class="tweet-url hashtag">#{value}</a>}
    when :url
      href = value
      href = "http://" + value unless href =~ /^http/i
      %{<a href="#{href}">#{value}</a>}
    else
      value
    end
  end

  # One shoulda context per fixture section, one test per fixture entry.
  YAML.load(File.read(DATA_PATH))["tests"].each do |section, examples|
    context "when testing #{section}" do
      examples.each do |example|
        should example["description"] do
          assert_autolink example["expected"], example["text"]
        end
      end
    end
  end
end
@@ -6,12 +6,8 @@ require "tweetparser"
6
6
 
7
7
  class ParserTest < Test::Unit::TestCase
8
8
 
9
- def setup
10
- @parser = TweetContentParser.new
11
- end
12
-
13
9
  def assert_parses(expected, input)
14
- actual = @parser.parse(input).content
10
+ actual = TweetParser.parse(input)
15
11
  assert_equal expected, actual
16
12
  end
17
13
 
@@ -24,6 +20,21 @@ class ParserTest < Test::Unit::TestCase
24
20
  assert_parses [[:url, s]], s
25
21
  end
26
22
 
23
+ should "extract url with www and no http" do
24
+ s = "www.example.com/mail/?ui=2&shva=1#inbox"
25
+ assert_parses [[:url, s]], s
26
+ end
27
+
28
+ should "extract IDN url" do
29
+ s = "http://✪df.ws/ejp"
30
+ assert_parses [[:url, s]], s
31
+ end
32
+
33
+ should "not extract invalid domain" do
34
+ s = "http://example_com/mail/?ui=2&shva=1#inbox"
35
+ assert_parses [[:text, s]], s
36
+ end
37
+
27
38
  should "extract hashtag" do
28
39
  s = "#HashTag2010"
29
40
  assert_parses [[:hashtag, s]], s
@@ -31,7 +42,7 @@ class ParserTest < Test::Unit::TestCase
31
42
 
32
43
  should "extract at-references" do
33
44
  s = "@AtRef_3000"
34
- assert_parses [[:atref, s]], s
45
+ assert_parses [[:username, s]], s
35
46
  end
36
47
 
37
48
  should "extract HTML" do
@@ -39,9 +50,19 @@ class ParserTest < Test::Unit::TestCase
39
50
  assert_parses [[:html, s]], s
40
51
  end
41
52
 
53
+ should "extract a slash comment" do
54
+ s = %{/via}
55
+ assert_parses [[:slash, s]], s
56
+ end
57
+
58
+ should "extract a list" do
59
+ s = %{@username/list}
60
+ assert_parses [[:list, s]], s
61
+ end
62
+
42
63
  should "extract words spaces and new lines" do
43
64
  s = "this string\nhas spaces!"
44
- expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
65
+ expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline, "\n"],
45
66
  [:text, "has"], [:space, " "], [:text, "spaces!"]]
46
67
  assert_parses expected, s
47
68
  end
@@ -51,7 +72,7 @@ class ParserTest < Test::Unit::TestCase
51
72
  expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, " "],
52
73
  [:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
53
74
  [:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
54
- [:html, "</a>"], [:newline],
75
+ [:html, "</a>"], [:newline, "\n"],
55
76
  [:url, "http://twitpic.com/14vzny"],
56
77
  [:space, " "], [:text, "3"], [:space, " "],
57
78
  [:url, "http://twitpic.com/14vzny"]]
@@ -60,7 +81,7 @@ class ParserTest < Test::Unit::TestCase
60
81
 
61
82
  should "extract elements from real-world sample" do
62
83
  s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
63
- expected = [[:text, "RT"], [:space, " "], [:atref, "@newsbrooke"], [:space, " "],
84
+ expected = [[:text, "RT"], [:space, " "], [:username, "@newsbrooke"], [:space, " "],
64
85
  [:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
65
86
  [:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
66
87
  [:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],
@@ -0,0 +1,6 @@
1
+ Yaml files that define the conformance testing for twitter-text-* libraries.
2
+
3
+ == TODO
4
+
5
+ * Describe the format in the README
6
+ * Add more tests (ongoing)
@@ -0,0 +1,250 @@
1
+
2
+ tests:
3
+ usernames:
4
+ - description: "Autolink trailing username"
5
+ text: "text @username"
6
+ expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
7
+
8
+ - description: "Autolink username at the beginning"
9
+ text: "@username text"
10
+ expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a> text"
11
+
12
+ - description: "DO NOT Autolink username preceded by a letter"
13
+ text: "meet@the beach"
14
+ expected: "meet@the beach"
15
+
16
+ - description: "Autolink username preceded by punctuation"
17
+ text: "great.@username"
18
+ expected: "great.@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
19
+
20
+ - description: "Autolink username followed by punctuation"
21
+ text: "@username&^$%^"
22
+ expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>&^$%^"
23
+
24
+ - description: "Autolink username followed by Japanese"
25
+ text: "@usernameの"
26
+ expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
27
+
28
+ - description: "Autolink username preceded by Japanese"
29
+ text: "あ@username"
30
+ expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
31
+
32
+ - description: "Autolink username surrounded by Japanese"
33
+ text: "あ@usernameの"
34
+ expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
35
+
36
+ - description: "Autolink username with full-width at sign (U+FF20)"
37
+ text: "＠username"
38
+ expected: "＠<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
39
+
40
+ - description: "DO NOT Autolink username over 20 characters"
41
+ text: "@username9012345678901"
42
+ expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username901234567890\">username901234567890</a>1"
43
+ lists:
44
+ - description: "Autolink list preceded by a space"
45
+ text: "text @username/list"
46
+ expected: "text @<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
47
+
48
+ - description: "DO NOT Autolink list when space follows slash"
49
+ text: "text @username/ list"
50
+ expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>/ list"
51
+
52
+ - description: "DO NOT Autolink list with empty username"
53
+ text: "text @/list"
54
+ expected: "text @/list"
55
+
56
+ - description: "Autolink list at the beginning"
57
+ text: "@username/list"
58
+ expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
59
+
60
+ - description: "DO NOT Autolink list preceded by letter"
61
+ text: "meet@the/beach"
62
+ expected: "meet@the/beach"
63
+
64
+ - description: "Autolink list preceded by punctuation"
65
+ text: "great.@username/list"
66
+ expected: "great.@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
67
+
68
+ - description: "Autolink list followed by punctuation"
69
+ text: "@username/list&^$%^"
70
+ expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>&^$%^"
71
+
72
+ - description: "Autolink list name over 80 characters (truncated to 80)"
73
+ text: "@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A"
74
+ expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list5678901234567890123456789012345678901234567890123456789012345678901234567890\">username/list5678901234567890123456789012345678901234567890123456789012345678901234567890</a>A"
75
+
76
+ hashtags:
77
+ - description: "Autolink trailing hashtag"
78
+ text: "text #hashtag"
79
+ expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
80
+
81
+ - description: "Autolink alphanumeric hashtag (letter-number-letter)"
82
+ text: "text #hash0tag"
83
+ expected: "text <a href=\"http://twitter.com/search?q=%23hash0tag\" title=\"#hash0tag\" class=\"tweet-url hashtag\">#hash0tag</a>"
84
+
85
+ - description: "Autolink alphanumeric hashtag (number-letter)"
86
+ text: "text #1tag"
87
+ expected: "text <a href=\"http://twitter.com/search?q=%231tag\" title=\"#1tag\" class=\"tweet-url hashtag\">#1tag</a>"
88
+
89
+ - description: "Autolink hashtag with underscore"
90
+ text: "text #hash_tag"
91
+ expected: "text <a href=\"http://twitter.com/search?q=%23hash_tag\" title=\"#hash_tag\" class=\"tweet-url hashtag\">#hash_tag</a>"
92
+
93
+ - description: "DO NOT Autolink all-numeric hashtags"
94
+ text: "text #1234"
95
+ expected: "text #1234"
96
+
97
+ - description: "DO NOT Autolink hashtag preceded by a letter"
98
+ text: "text#hashtag"
99
+ expected: "text#hashtag"
100
+
101
+ - description: "Autolink multiple hashtags"
102
+ text: "text #hashtag1 #hashtag2"
103
+ expected: "text <a href=\"http://twitter.com/search?q=%23hashtag1\" title=\"#hashtag1\" class=\"tweet-url hashtag\">#hashtag1</a> <a href=\"http://twitter.com/search?q=%23hashtag2\" title=\"#hashtag2\" class=\"tweet-url hashtag\">#hashtag2</a>"
104
+
105
+ - description: "Autolink hashtag preceded by a period"
106
+ text: "text.#hashtag"
107
+ expected: "text.<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
108
+
109
+ - description: "DO NOT Autolink hashtag preceded by &"
110
+ text: "&#nbsp;"
111
+ expected: "&#nbsp;"
112
+
113
+ - description: "Autolink hashtag followed by ! (! not included)"
114
+ text: "text #hashtag!"
115
+ expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>!"
116
+
117
+ - description: "Autolink hashtag followed by Japanese"
118
+ text: "text #hashtagの"
119
+ expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>の"
120
+
121
+ - description: "Autolink hashtag preceded by full-width space (U+3000)"
122
+ text: "text　#hashtag"
123
+ expected: "text　<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
124
+
125
+ - description: "Autolink hashtag followed by full-width space (U+3000)"
126
+ text: "#hashtag　text"
127
+ expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>　text"
128
+
129
+ - description: "Autolink hashtag with full-width hash (U+FF03)"
130
+ text: "＃hashtag"
131
+ expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">＃hashtag</a>"
132
+
133
+ urls:
134
+ - description: "Autolink trailing url"
135
+ text: "text http://example.com"
136
+ expected: "text <a href=\"http://example.com\">http://example.com</a>"
137
+
138
+ - description: "Autolink url in mid-text"
139
+ text: "text http://example.com more text"
140
+ expected: "text <a href=\"http://example.com\">http://example.com</a> more text"
141
+
142
+ - description: "Autolink url in Japanese text"
143
+ text: "いまなにしてるhttp://example.comいまなにしてる"
144
+ expected: "いまなにしてる<a href=\"http://example.com\">http://example.com</a>いまなにしてる"
145
+
146
+ - description: "Autolink url surrounded by parentheses"
147
+ text: "text (http://example.com)"
148
+ expected: "text (<a href=\"http://example.com\">http://example.com</a>)"
149
+
150
+ - description: "Autolink url containing unicode characters"
151
+ text: "I enjoy Macintosh Brand computers: http://✪df.ws/ejp"
152
+ expected: "I enjoy Macintosh Brand computers: <a href=\"http://✪df.ws/ejp\">http://✪df.ws/ejp</a>"
153
+
154
+ - description: "DO NOT Autolink url containing ! character in the domain"
155
+ text: "badly formatted http://foo!bar.com"
156
+ expected: "badly formatted http://foo!bar.com"
157
+
158
+ - description: "DO NOT Autolink url containing _ character in the domain"
159
+ text: "badly formatted http://foo_bar.com"
160
+ expected: "badly formatted http://foo_bar.com"
161
+
162
+ - description: "Autolink url preceded by :"
163
+ text: "text:http://example.com"
164
+ expected: "text:<a href=\"http://example.com\">http://example.com</a>"
165
+
166
+ - description: "Autolink url followed by ? (without it)"
167
+ text: "text http://example.com?"
168
+ expected: "text <a href=\"http://example.com\">http://example.com</a>?"
169
+
170
+ - description: "Autolink url followed by ! (without it)"
171
+ text: "text http://example.com!"
172
+ expected: "text <a href=\"http://example.com\">http://example.com</a>!"
173
+
174
+ - description: "Autolink url followed by , (without it)"
175
+ text: "text http://example.com,"
176
+ expected: "text <a href=\"http://example.com\">http://example.com</a>,"
177
+
178
+ - description: "Autolink url followed by . (without it)"
179
+ text: "text http://example.com."
180
+ expected: "text <a href=\"http://example.com\">http://example.com</a>."
181
+
182
+ - description: "Autolink url followed by : (without it)"
183
+ text: "text http://example.com:"
184
+ expected: "text <a href=\"http://example.com\">http://example.com</a>:"
185
+
186
+ - description: "Autolink url followed by ; (without it)"
187
+ text: "text http://example.com;"
188
+ expected: "text <a href=\"http://example.com\">http://example.com</a>;"
189
+
190
+ - description: "Autolink url followed by ] (without it)"
191
+ text: "text http://example.com]"
192
+ expected: "text <a href=\"http://example.com\">http://example.com</a>]"
193
+
194
+ - description: "Autolink url followed by ) (without it)"
195
+ text: "text http://example.com)"
196
+ expected: "text <a href=\"http://example.com\">http://example.com</a>)"
197
+
198
+ - description: "Autolink url followed by } (without it)"
199
+ text: "text http://example.com}"
200
+ expected: "text <a href=\"http://example.com\">http://example.com</a>}"
201
+
202
+ - description: "Autolink url followed by = (without it)"
203
+ text: "text http://example.com="
204
+ expected: "text <a href=\"http://example.com\">http://example.com</a>="
205
+
206
+ - description: "Autolink url followed by ' (without it)"
207
+ text: "text http://example.com'"
208
+ expected: "text <a href=\"http://example.com\">http://example.com</a>'"
209
+
210
+ - description: "DO NOT Autolink url preceded by '"
211
+ text: "text 'http://example.com"
212
+ expected: "text 'http://example.com"
213
+
214
+ - description: "DO NOT Autolink url preceded by /"
215
+ text: "text /http://example.com"
216
+ expected: "text /http://example.com"
217
+
218
+ - description: "DO NOT Autolink url preceded by !"
219
+ text: "text !http://example.com"
220
+ expected: "text !http://example.com"
221
+
222
+ - description: "DO NOT Autolink url preceded by ="
223
+ text: "text =http://example.com"
224
+ expected: "text =http://example.com"
225
+
226
+ - description: "Autolink url embedded in link tag"
227
+ text: "<link rel='true'>http://example.com</link>"
228
+ expected: "<link rel='true'><a href=\"http://example.com\">http://example.com</a></link>"
229
+
230
+ - description: "Autolink multiple urls"
231
+ text: "http://example.com https://sslexample.com http://sub.example.com"
232
+ expected: "<a href=\"http://example.com\">http://example.com</a> <a href=\"https://sslexample.com\">https://sslexample.com</a> <a href=\"http://sub.example.com\">http://sub.example.com</a>"
233
+
234
+ - description: "Autolink url with long TLD"
235
+ text: "http://example.mobi/path"
236
+ expected: "<a href=\"http://example.mobi/path\">http://example.mobi/path</a>"
237
+
238
+ - description: "Autolink url without protocol (with www)"
239
+ text: "www.example.com"
240
+ expected: "<a href=\"http://www.example.com\">www.example.com</a>"
241
+
242
+ - description: "Autolink url without protocol (with WWW)"
243
+ text: "WWW.EXAMPLE.COM"
244
+ expected: "<a href=\"http://WWW.EXAMPLE.COM\">WWW.EXAMPLE.COM</a>"
245
+
246
+ all:
247
+ - description: "Autolink url does not overlap @username"
248
+ text: "Check out: http://example.com/test&@chasesechrist"
249
+ expected: "Check out: <a href=\"http://example.com/test&\">http://example.com/test&</a>@<a class=\"tweet-url username\" href=\"http://twitter.com/chasesechrist\">chasesechrist</a>"
250
+
@@ -0,0 +1,193 @@
1
+
2
+ tests:
3
+ mentions:
4
+ - description: "Extract mention at the beginning of a tweet"
5
+ text: "@username reply"
6
+ expected: ["username"]
7
+
8
+ - description: "Extract mention at the end of a tweet"
9
+ text: "mention @username"
10
+ expected: ["username"]
11
+
12
+ - description: "Extract mention in the middle of a tweet"
13
+ text: "mention @username in the middle"
14
+ expected: ["username"]
15
+
16
+ - description: "Extract mention of username with underscore"
17
+ text: "mention @user_name"
18
+ expected: ["user_name"]
19
+
20
+ - description: "Extract mention of all numeric username"
21
+ text: "mention @12345"
22
+ expected: ["12345"]
23
+
24
+ - description: "Extract mention of multiple usernames"
25
+ text: "mention @username1 @username2"
26
+ expected: ["username1", "username2"]
27
+
28
+ - description: "Extract mention in the middle of a Japanese tweet"
29
+ text: "の@usernameに到着を待っている"
30
+ expected: ["username"]
31
+
32
+ - description: "DO NOT extract username ending in @"
33
+ text: "Current Status: @_@ (cc: @username)"
34
+ expected: ["username"]
35
+
36
+ - description: "Extract lone mention but not @user@user (too close to an email)"
37
+ text: "@username email me @test@example.com"
38
+ expected: ["username"]
39
+
40
+ replies:
41
+ - description: "Extract reply at the beginning of a tweet"
42
+ text: "@username reply"
43
+ expected: "username"
44
+
45
+ - description: "Extract reply preceded by only a space"
46
+ text: " @username reply"
47
+ expected: "username"
48
+
49
+ - description: "Extract reply preceded by only a full-width space (U+3000)"
50
+ text: "　@username reply"
51
+ expected: "username"
52
+
53
+ - description: "DO NOT Extract reply when preceded by text"
54
+ text: "a @username mention, not a reply"
55
+ expected:
56
+
57
+ - description: "DO NOT Extract reply when preceded by ."
58
+ text: ".@username mention, not a reply"
59
+ expected:
60
+
61
+ - description: "DO NOT Extract reply when preceded by /"
62
+ text: "/@username mention, not a reply"
63
+ expected:
64
+
65
+ - description: "DO NOT Extract reply when preceded by _"
66
+ text: "_@username mention, not a reply"
67
+ expected:
68
+
69
+ - description: "DO NOT Extract reply when preceded by -"
70
+ text: "-@username mention, not a reply"
71
+ expected:
72
+
73
+ - description: "DO NOT Extract reply when preceded by +"
74
+ text: "+@username mention, not a reply"
75
+ expected:
76
+
77
+ - description: "DO NOT Extract reply when preceded by #"
78
+ text: "#@username mention, not a reply"
79
+ expected:
80
+
81
+ - description: "DO NOT Extract reply when preceded by !"
82
+ text: "!@username mention, not a reply"
83
+ expected:
84
+
85
+ - description: "DO NOT Extract reply when preceded by @"
86
+ text: "@@username mention, not a reply"
87
+ expected:
88
+
89
+ urls:
90
+ - description: "Extract a lone URL"
91
+ text: "http://example.com"
92
+ expected: ["http://example.com"]
93
+
94
+ - description: "Extract valid URL: http://google.com"
95
+ text: "text http://google.com"
96
+ expected: ["http://google.com"]
97
+
98
+ - description: "Extract valid URL: http://foobar.com/#"
99
+ text: "text http://foobar.com/#"
100
+ expected: ["http://foobar.com/#"]
101
+
102
+ - description: "Extract valid URL: http://google.com/#foo"
103
+ text: "text http://google.com/#foo"
104
+ expected: ["http://google.com/#foo"]
105
+
106
+ - description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks"
107
+ text: "text http://google.com/#search?q=iphone%20-filter%3Alinks"
108
+ expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"]
109
+
110
+ - description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks"
111
+ text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks"
112
+ expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"]
113
+
114
+ - description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
115
+ text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
116
+ expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"]
117
+
118
+ - description: "Extract valid URL: http://somehost.com:3000"
119
+ text: "text http://somehost.com:3000"
120
+ expected: ["http://somehost.com:3000"]
121
+
122
+ - description: "Extract valid URL: http://x.com/~matthew+%-x"
123
+ text: "text http://x.com/~matthew+%-x"
124
+ expected: ["http://x.com/~matthew+%-x"]
125
+
126
+ - description: "Extract valid URL: http://x.com/~matthew+%-,.;x"
127
+ text: "text http://x.com/~matthew+%-,.;x"
128
+ expected: ["http://x.com/~matthew+%-,.;x"]
129
+
130
+ - description: "Extract valid URL: http://x.com/,.;x"
131
+ text: "text http://x.com/,.;x"
132
+ expected: ["http://x.com/,.;x"]
133
+
134
+ - description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)"
135
+ text: "text http://en.wikipedia.org/wiki/Primer_(film)"
136
+ expected: ["http://en.wikipedia.org/wiki/Primer_(film)"]
137
+
138
+ - description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59"
139
+ text: "text http://www.ams.org/bookstore-getitem/item=mbk-59"
140
+ expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"]
141
+
142
+ - description: "Extract valid URL: http://✪df.ws/ejp"
143
+ text: "text http://✪df.ws/ejp"
144
+ expected: ["http://✪df.ws/ejp"]
145
+
146
+ - description: "Extract valid URL: http://chilp.it/?77e8fd"
147
+ text: "text http://chilp.it/?77e8fd"
148
+ expected: ["http://chilp.it/?77e8fd"]
149
+
150
+ - description: "DO NOT extract invalid URL: http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
151
+ text: "text http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
152
+ expected: []
153
+
154
+ - description: "DO NOT extract invalid URL: http://no-tld"
155
+ text: "text http://no-tld"
156
+ expected: []
157
+
158
+ - description: "DO NOT extract invalid URL: http://tld-too-short.x"
159
+ text: "text http://tld-too-short.x"
160
+ expected: []
161
+
162
+ hashtags:
163
+ - description: "Extract an all-alpha hashtag"
164
+ text: "a #hashtag here"
165
+ expected: ["hashtag"]
166
+
167
+ - description: "Extract a letter-then-number hashtag"
168
+ text: "this is #hashtag1"
169
+ expected: ["hashtag1"]
170
+
171
+ - description: "Extract a number-then-letter hashtag"
172
+ text: "#1hashtag is this"
173
+ expected: ["1hashtag"]
174
+
175
+ - description: "DO NOT Extract an all-numeric hashtag"
176
+ text: "On the #16 bus"
177
+ expected: []
178
+
179
+ - description: "Extract a hashtag containing ñ"
180
+ text: "I'll write more tests #mañana"
181
+ expected: ["mañana"]
182
+
183
+ - description: "Extract a hashtag containing é"
184
+ text: "Working remotely #café"
185
+ expected: ["café"]
186
+
187
+ - description: "Extract a hashtag containing ü"
188
+ text: "Getting my Oktoberfest on #münchen"
189
+ expected: ["münchen"]
190
+
191
+ - description: "DO NOT Extract a hashtag containing Japanese"
192
+ text: "this is not valid: # 会議中 ハッシュ"
193
+ expected: []
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tweetparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Battley
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-23 00:00:00 +00:00
12
+ date: 2010-02-24 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -51,7 +51,11 @@ extensions: []
51
51
  extra_rdoc_files: []
52
52
 
53
53
  files:
54
+ - test/conformance_test.rb
54
55
  - test/parser_test.rb
56
+ - test/twitter-text-conformance/autolink.yml
57
+ - test/twitter-text-conformance/extract.yml
58
+ - test/twitter-text-conformance/README
55
59
  - lib/tweetparser/grammar.treetop
56
60
  - lib/tweetparser.rb
57
61
  has_rdoc: true