tweetparser 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,15 @@
1
1
  require "treetop"
2
2
  require "polyglot"
3
3
  require "tweetparser/grammar"
4
+
5
+ module TweetParser
6
+ def self.parse(input)
7
+ kcode = $KCODE
8
+ $KCODE = "n"
9
+ parser = TweetContentParser.new
10
+ parsed = parser.parse(input)
11
+ $KCODE = kcode
12
+ return nil unless parsed
13
+ parsed.content
14
+ end
15
+ end
@@ -1,38 +1,66 @@
1
1
  grammar TweetContent
2
2
  rule tweet
3
- (url / html / space / newline / atref / hashtag / text)* {
3
+ (url / html / space / newline / list / username / hashtag / slash / text)* {
4
4
  def content
5
5
  elements.map{ |e| e.content }
6
6
  end
7
7
  }
8
8
  end
9
9
 
10
+ rule subdomain
11
+ ([a-zA-Z0-9\-] / [^\x20-\x7F])+
12
+ end
13
+
10
14
  rule url
11
- "http" "s"? "://" [\./a-zA-Z0-9\?#=\-_&%]+ {
15
+ (("http" / "HTTP") [sS]? "://" / "www." / "WWW.") subdomain ("." subdomain)+ ("/" [\.a-zA-Z0-9\?#=\-_&%]*)* {
12
16
  def content
13
17
  [:url, text_value]
14
18
  end
15
19
  }
16
20
  end
17
21
 
18
- rule atref
19
- "@" [a-zA-Z0-9_]+ {
22
+ rule name
23
+ [a-zA-Z0-9_]+
24
+ end
25
+
26
+ rule name_with_letters
27
+ [a-zA-Z] [a-zA-Z0-9_]* / [0-9_]+ [a-zA-Z] [a-zA-Z0-9_]*
28
+ end
29
+
30
+ rule username
31
+ ("@" / "＠") name {
32
+ def content
33
+ [:username, text_value]
34
+ end
35
+ }
36
+ end
37
+
38
+ rule list
39
+ username "/" name {
20
40
  def content
21
- [:atref, text_value]
41
+ [:list, text_value]
22
42
  end
23
43
  }
24
44
  end
25
45
 
26
46
  rule hashtag
27
- "#" [a-zA-Z0-9_]+ {
47
+ ("#" / "＃") name_with_letters {
28
48
  def content
29
49
  [:hashtag, text_value]
30
50
  end
31
51
  }
32
52
  end
33
53
 
54
+ rule slash
55
+ "/" name {
56
+ def content
57
+ [:slash, text_value]
58
+ end
59
+ }
60
+ end
61
+
34
62
  rule text
35
- ([^h\s] / "h" !("ttp" "s"? "://"))+ {
63
+ [\S]+ {
36
64
  def content
37
65
  [:text, text_value]
38
66
  end
@@ -50,13 +78,13 @@ grammar TweetContent
50
78
  rule newline
51
79
  "\r"? "\n" {
52
80
  def content
53
- [:newline]
81
+ [:newline, text_value]
54
82
  end
55
83
  }
56
84
  end
57
85
 
58
86
  rule space
59
- " "+ {
87
+ (" " / "　")+ {
60
88
  def content
61
89
  [:space, text_value]
62
90
  end
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+ $:.unshift(File.expand_path("../../lib", __FILE__))
3
+ require "test/unit"
4
+ require "shoulda"
5
+ require "tweetparser"
6
+ require "yaml"
7
+ require "cgi"
8
+
9
+ class AutolinkConformanceTest < Test::Unit::TestCase
10
+ DATA_PATH = File.expand_path("../twitter-text-conformance/autolink.yml", __FILE__)
11
+
12
+ def assert_autolink(expected, input)
13
+ sexpr = TweetParser.parse(input)
14
+ assert sexpr, "Failed to parse #{input}"
15
+ actual = sexpr.inject(""){ |output, (type, value)|
16
+ output <<
17
+ case type
18
+ when :username
19
+ at, username = value.scan(/^.|.*/)
20
+ %{#{at}<a class="tweet-url username" href="http://twitter.com/#{username}">#{username}</a>}
21
+ when :list
22
+ at, list = value.scan(/^.|.*/)
23
+ %{#{at}<a class="tweet-url list-slug" href=\"http://twitter.com/#{list}">#{list}</a>}
24
+ when :hashtag
25
+ hash, hashtag = value.scan(/^.|.*/)
26
+ %{<a href="http://twitter.com/search?q=%23#{hashtag}" }+
27
+ %{title="\##{hashtag}" class="tweet-url hashtag">#{value}</a>}
28
+ when :url
29
+ href = value
30
+ href = "http://" + value unless href =~ /^http/i
31
+ %{<a href="#{href}">#{value}</a>}
32
+ else
33
+ value
34
+ end
35
+ }
36
+ assert_equal expected, actual, sexpr.inspect
37
+ end
38
+
39
+ YAML.load(File.read(DATA_PATH))["tests"].each do |section, tests|
40
+ context "when testing #{section}" do
41
+ tests.each do |hash|
42
+ should hash["description"] do
43
+ assert_autolink hash["expected"], hash["text"]
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
@@ -6,12 +6,8 @@ require "tweetparser"
6
6
 
7
7
  class ParserTest < Test::Unit::TestCase
8
8
 
9
- def setup
10
- @parser = TweetContentParser.new
11
- end
12
-
13
9
  def assert_parses(expected, input)
14
- actual = @parser.parse(input).content
10
+ actual = TweetParser.parse(input)
15
11
  assert_equal expected, actual
16
12
  end
17
13
 
@@ -24,6 +20,21 @@ class ParserTest < Test::Unit::TestCase
24
20
  assert_parses [[:url, s]], s
25
21
  end
26
22
 
23
+ should "extract url with www and no http" do
24
+ s = "www.example.com/mail/?ui=2&shva=1#inbox"
25
+ assert_parses [[:url, s]], s
26
+ end
27
+
28
+ should "extract IDN url" do
29
+ s = "http://✪df.ws/ejp"
30
+ assert_parses [[:url, s]], s
31
+ end
32
+
33
+ should "not extract invalid domain" do
34
+ s = "http://example_com/mail/?ui=2&shva=1#inbox"
35
+ assert_parses [[:text, s]], s
36
+ end
37
+
27
38
  should "extract hashtag" do
28
39
  s = "#HashTag2010"
29
40
  assert_parses [[:hashtag, s]], s
@@ -31,7 +42,7 @@ class ParserTest < Test::Unit::TestCase
31
42
 
32
43
  should "extract at-references" do
33
44
  s = "@AtRef_3000"
34
- assert_parses [[:atref, s]], s
45
+ assert_parses [[:username, s]], s
35
46
  end
36
47
 
37
48
  should "extract HTML" do
@@ -39,9 +50,19 @@ class ParserTest < Test::Unit::TestCase
39
50
  assert_parses [[:html, s]], s
40
51
  end
41
52
 
53
+ should "extract a slash comment" do
54
+ s = %{/via}
55
+ assert_parses [[:slash, s]], s
56
+ end
57
+
58
+ should "extract a list" do
59
+ s = %{@username/list}
60
+ assert_parses [[:list, s]], s
61
+ end
62
+
42
63
  should "extract words spaces and new lines" do
43
64
  s = "this string\nhas spaces!"
44
- expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
65
+ expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline, "\n"],
45
66
  [:text, "has"], [:space, " "], [:text, "spaces!"]]
46
67
  assert_parses expected, s
47
68
  end
@@ -51,7 +72,7 @@ class ParserTest < Test::Unit::TestCase
51
72
  expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, " "],
52
73
  [:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
53
74
  [:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
54
- [:html, "</a>"], [:newline],
75
+ [:html, "</a>"], [:newline, "\n"],
55
76
  [:url, "http://twitpic.com/14vzny"],
56
77
  [:space, " "], [:text, "3"], [:space, " "],
57
78
  [:url, "http://twitpic.com/14vzny"]]
@@ -60,7 +81,7 @@ class ParserTest < Test::Unit::TestCase
60
81
 
61
82
  should "extract elements from real-world sample" do
62
83
  s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
63
- expected = [[:text, "RT"], [:space, " "], [:atref, "@newsbrooke"], [:space, " "],
84
+ expected = [[:text, "RT"], [:space, " "], [:username, "@newsbrooke"], [:space, " "],
64
85
  [:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
65
86
  [:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
66
87
  [:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],
@@ -0,0 +1,6 @@
1
+ Yaml files that define the conformance testing for twitter-text-* libraries.
2
+
3
+ == TODO
4
+
5
+ * Describe the format in the README
6
+ * Add more tests (ongoing)
@@ -0,0 +1,250 @@
1
+
2
+ tests:
3
+ usernames:
4
+ - description: "Autolink trailing username"
5
+ text: "text @username"
6
+ expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
7
+
8
+ - description: "Autolink username at the beginning"
9
+ text: "@username text"
10
+ expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a> text"
11
+
12
+ - description: "DO NOT Autolink username preceded by a letter"
13
+ text: "meet@the beach"
14
+ expected: "meet@the beach"
15
+
16
+ - description: "Autolink username preceded by puctuation"
17
+ text: "great.@username"
18
+ expected: "great.@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
19
+
20
+ - description: "Autolink username followed by puctuation"
21
+ text: "@username&^$%^"
22
+ expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>&^$%^"
23
+
24
+ - description: "Autolink username followed by Japanese"
25
+ text: "@usernameの"
26
+ expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
27
+
28
+ - description: "Autolink username preceded by Japanese"
29
+ text: "あ@username"
30
+ expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
31
+
32
+ - description: "Autolink username surrounded by Japanese"
33
+ text: "あ@usernameの"
34
+ expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
35
+
36
+ - description: "Autolink username with full-width at sign (U+FF20)"
37
+ text: "＠username"
38
+ expected: "＠<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
39
+
40
+ - description: "DO NOT Autolink username over 20 characters"
41
+ text: "@username9012345678901"
42
+ expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username901234567890\">username901234567890</a>1"
43
+ lists:
44
+ - description: "Autolink list preceded by a space"
45
+ text: "text @username/list"
46
+ expected: "text @<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
47
+
48
+ - description: "DO NOT Autolink list when space follows slash"
49
+ text: "text @username/ list"
50
+ expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>/ list"
51
+
52
+ - description: "DO NOT Autolink list with empty username"
53
+ text: "text @/list"
54
+ expected: "text @/list"
55
+
56
+ - description: "Autolink list at the beginning"
57
+ text: "@username/list"
58
+ expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
59
+
60
+ - description: "DO NOT Autolink list preceded by letter"
61
+ text: "meet@the/beach"
62
+ expected: "meet@the/beach"
63
+
64
+ - description: "Autolink list preceded by puctuation"
65
+ text: "great.@username/list"
66
+ expected: "great.@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
67
+
68
+ - description: "Autolink list followed by puctuation"
69
+ text: "@username/list&^$%^"
70
+ expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>&^$%^"
71
+
72
+ - description: "Autolink list name over 80 characters (truncated to 80)"
73
+ text: "@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A"
74
+ expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list5678901234567890123456789012345678901234567890123456789012345678901234567890\">username/list5678901234567890123456789012345678901234567890123456789012345678901234567890</a>A"
75
+
76
+ hashtags:
77
+ - description: "Autolink trailing hashtag"
78
+ text: "text #hashtag"
79
+ expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
80
+
81
+ - description: "Autolink alphanumeric hashtag (letter-number-letter)"
82
+ text: "text #hash0tag"
83
+ expected: "text <a href=\"http://twitter.com/search?q=%23hash0tag\" title=\"#hash0tag\" class=\"tweet-url hashtag\">#hash0tag</a>"
84
+
85
+ - description: "Autolink alphanumeric hashtag (number-letter)"
86
+ text: "text #1tag"
87
+ expected: "text <a href=\"http://twitter.com/search?q=%231tag\" title=\"#1tag\" class=\"tweet-url hashtag\">#1tag</a>"
88
+
89
+ - description: "Autolink hashtag with underscore"
90
+ text: "text #hash_tag"
91
+ expected: "text <a href=\"http://twitter.com/search?q=%23hash_tag\" title=\"#hash_tag\" class=\"tweet-url hashtag\">#hash_tag</a>"
92
+
93
+ - description: "DO NOT Autolink all-numeric hashtags"
94
+ text: "text #1234"
95
+ expected: "text #1234"
96
+
97
+ - description: "DO NOT Autolink hashtag preceded by a letter"
98
+ text: "text#hashtag"
99
+ expected: "text#hashtag"
100
+
101
+ - description: "Autolink multiple hashtags"
102
+ text: "text #hashtag1 #hashtag2"
103
+ expected: "text <a href=\"http://twitter.com/search?q=%23hashtag1\" title=\"#hashtag1\" class=\"tweet-url hashtag\">#hashtag1</a> <a href=\"http://twitter.com/search?q=%23hashtag2\" title=\"#hashtag2\" class=\"tweet-url hashtag\">#hashtag2</a>"
104
+
105
+ - description: "Autolink hashtag preceded by a period"
106
+ text: "text.#hashtag"
107
+ expected: "text.<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
108
+
109
+ - description: "DO NOT Autolink hashtag preceded by &"
110
+ text: "&#nbsp;"
111
+ expected: "&#nbsp;"
112
+
113
+ - description: "Autolink hashtag followed by ! (! not included)"
114
+ text: "text #hashtag!"
115
+ expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>!"
116
+
117
+ - description: "Autolink hashtag followed by Japanese"
118
+ text: "text #hashtagの"
119
+ expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>の"
120
+
121
+ - description: "Autolink hashtag preceded by full-width space (U+3000)"
122
+ text: "text　#hashtag"
122
+ expected: "text　<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
124
+
125
+ - description: "Autolink hashtag followed by full-width space (U+3000)"
126
+ text: "#hashtag　text"
126
+ expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>　text"
128
+
129
+ - description: "Autolink hashtag with full-width hash (U+FF03)"
130
+ text: "＃hashtag"
130
+ expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">＃hashtag</a>"
132
+
133
+ urls:
134
+ - description: "Autolink trailing url"
135
+ text: "text http://example.com"
136
+ expected: "text <a href=\"http://example.com\">http://example.com</a>"
137
+
138
+ - description: "Autolink url in mid-text"
139
+ text: "text http://example.com more text"
140
+ expected: "text <a href=\"http://example.com\">http://example.com</a> more text"
141
+
142
+ - description: "Autolink url in Japanese text"
143
+ text: "いまなにしてるhttp://example.comいまなにしてる"
144
+ expected: "いまなにしてる<a href=\"http://example.com\">http://example.com</a>いまなにしてる"
145
+
146
+ - description: "Autolink url surrounded by parentheses"
147
+ text: "text (http://example.com)"
148
+ expected: "text (<a href=\"http://example.com\">http://example.com</a>)"
149
+
150
+ - description: "Autolink url containing unicode characters"
151
+ text: "I enjoy Macintosh Brand computers: http://✪df.ws/ejp"
152
+ expected: "I enjoy Macintosh Brand computers: <a href=\"http://✪df.ws/ejp\">http://✪df.ws/ejp</a>"
153
+
154
+ - description: "DO NOT Autolink url containing ! character in the domain"
155
+ text: "badly formatted http://foo!bar.com"
156
+ expected: "badly formatted http://foo!bar.com"
157
+
158
+ - description: "DO NOT Autolink url containing _ character in the domain"
159
+ text: "badly formatted http://foo_bar.com"
160
+ expected: "badly formatted http://foo_bar.com"
161
+
162
+ - description: "Autolink url preceded by :"
163
+ text: "text:http://example.com"
164
+ expected: "text:<a href=\"http://example.com\">http://example.com</a>"
165
+
166
+ - description: "Autolink url followed by ? (without it)"
167
+ text: "text http://example.com?"
168
+ expected: "text <a href=\"http://example.com\">http://example.com</a>?"
169
+
170
+ - description: "Autolink url followed by ! (without it)"
171
+ text: "text http://example.com!"
172
+ expected: "text <a href=\"http://example.com\">http://example.com</a>!"
173
+
174
+ - description: "Autolink url followed by , (without it)"
175
+ text: "text http://example.com,"
176
+ expected: "text <a href=\"http://example.com\">http://example.com</a>,"
177
+
178
+ - description: "Autolink url followed by . (without it)"
179
+ text: "text http://example.com."
180
+ expected: "text <a href=\"http://example.com\">http://example.com</a>."
181
+
182
+ - description: "Autolink url followed by : (without it)"
183
+ text: "text http://example.com:"
184
+ expected: "text <a href=\"http://example.com\">http://example.com</a>:"
185
+
186
+ - description: "Autolink url followed by ; (without it)"
187
+ text: "text http://example.com;"
188
+ expected: "text <a href=\"http://example.com\">http://example.com</a>;"
189
+
190
+ - description: "Autolink url followed by ] (without it)"
191
+ text: "text http://example.com]"
192
+ expected: "text <a href=\"http://example.com\">http://example.com</a>]"
193
+
194
+ - description: "Autolink url followed by ) (without it)"
195
+ text: "text http://example.com)"
196
+ expected: "text <a href=\"http://example.com\">http://example.com</a>)"
197
+
198
+ - description: "Autolink url followed by } (without it)"
199
+ text: "text http://example.com}"
200
+ expected: "text <a href=\"http://example.com\">http://example.com</a>}"
201
+
202
+ - description: "Autolink url followed by = (without it)"
203
+ text: "text http://example.com="
204
+ expected: "text <a href=\"http://example.com\">http://example.com</a>="
205
+
206
+ - description: "Autolink url followed by ' (without it)"
207
+ text: "text http://example.com'"
208
+ expected: "text <a href=\"http://example.com\">http://example.com</a>'"
209
+
210
+ - description: "DO NOT Autolink url preceded by '"
211
+ text: "text 'http://example.com"
212
+ expected: "text 'http://example.com"
213
+
214
+ - description: "DO NOT Autolink url preceded by /"
215
+ text: "text /http://example.com"
216
+ expected: "text /http://example.com"
217
+
218
+ - description: "DO NOT Autolink url preceded by !"
219
+ text: "text !http://example.com"
220
+ expected: "text !http://example.com"
221
+
222
+ - description: "DO NOT Autolink url preceded by ="
223
+ text: "text =http://example.com"
224
+ expected: "text =http://example.com"
225
+
226
+ - description: "Autolink url embedded in link tag"
227
+ text: "<link rel='true'>http://example.com</link>"
228
+ expected: "<link rel='true'><a href=\"http://example.com\">http://example.com</a></link>"
229
+
230
+ - description: "Autolink multiple urls"
231
+ text: "http://example.com https://sslexample.com http://sub.example.com"
232
+ expected: "<a href=\"http://example.com\">http://example.com</a> <a href=\"https://sslexample.com\">https://sslexample.com</a> <a href=\"http://sub.example.com\">http://sub.example.com</a>"
233
+
234
+ - description: "Autolink url with long TLD"
235
+ text: "http://example.mobi/path"
236
+ expected: "<a href=\"http://example.mobi/path\">http://example.mobi/path</a>"
237
+
238
+ - description: "Autolink url without protocol (with www)"
239
+ text: "www.example.com"
240
+ expected: "<a href=\"http://www.example.com\">www.example.com</a>"
241
+
242
+ - description: "Autolink url without protocol (with WWW)"
243
+ text: "WWW.EXAMPLE.COM"
244
+ expected: "<a href=\"http://WWW.EXAMPLE.COM\">WWW.EXAMPLE.COM</a>"
245
+
246
+ all:
247
+ - description: "Autolink url does not overlap @username"
248
+ text: "Check out: http://example.com/test&@chasesechrist"
249
+ expected: "Check out: <a href=\"http://example.com/test&\">http://example.com/test&</a>@<a class=\"tweet-url username\" href=\"http://twitter.com/chasesechrist\">chasesechrist</a>"
250
+
@@ -0,0 +1,193 @@
1
+
2
+ tests:
3
+ mentions:
4
+ - description: "Extract mention at the begining of a tweet"
5
+ text: "@username reply"
6
+ expected: ["username"]
7
+
8
+ - description: "Extract mention at the end of a tweet"
9
+ text: "mention @username"
10
+ expected: ["username"]
11
+
12
+ - description: "Extract mention in the middle of a tweet"
13
+ text: "mention @username in the middle"
14
+ expected: ["username"]
15
+
16
+ - description: "Extract mention of username with underscore"
17
+ text: "mention @user_name"
18
+ expected: ["user_name"]
19
+
20
+ - description: "Extract mention of all numeric username"
21
+ text: "mention @12345"
22
+ expected: ["12345"]
23
+
24
+ - description: "Extract mention or multiple usernames"
25
+ text: "mention @username1 @username2"
26
+ expected: ["username1", "username2"]
27
+
28
+ - description: "Extract mention in the middle of a Japanese tweet"
29
+ text: "の@usernameに到着を待っている"
30
+ expected: ["username"]
31
+
32
+ - description: "DO NOT extract username ending in @"
33
+ text: "Current Status: @_@ (cc: @username)"
34
+ expected: ["username"]
35
+
36
+ - description: "Extract lone metion but not @user@user (too close to an email)"
37
+ text: "@username email me @test@example.com"
38
+ expected: ["username"]
39
+
40
+ replies:
41
+ - description: "Extract reply at the begining of a tweet"
42
+ text: "@username reply"
43
+ expected: "username"
44
+
45
+ - description: "Extract reply preceded by only a space"
46
+ text: " @username reply"
47
+ expected: "username"
48
+
49
+ - description: "Extract reply preceded by only a full-width space (U+3000)"
50
+ text: "　@username reply"
51
+ expected: "username"
52
+
53
+ - description: "DO NOT Extract reply when preceded by text"
54
+ text: "a @username mention, not a reply"
55
+ expected:
56
+
57
+ - description: "DO NOT Extract reply when preceded by ."
58
+ text: ".@username mention, not a reply"
59
+ expected:
60
+
61
+ - description: "DO NOT Extract reply when preceded by /"
62
+ text: "/@username mention, not a reply"
63
+ expected:
64
+
65
+ - description: "DO NOT Extract reply when preceded by _"
66
+ text: "_@username mention, not a reply"
67
+ expected:
68
+
69
+ - description: "DO NOT Extract reply when preceded by -"
70
+ text: "-@username mention, not a reply"
71
+ expected:
72
+
73
+ - description: "DO NOT Extract reply when preceded by +"
74
+ text: "+@username mention, not a reply"
75
+ expected:
76
+
77
+ - description: "DO NOT Extract reply when preceded by #"
78
+ text: "#@username mention, not a reply"
79
+ expected:
80
+
81
+ - description: "DO NOT Extract reply when preceded by !"
82
+ text: "!@username mention, not a reply"
83
+ expected:
84
+
85
+ - description: "DO NOT Extract reply when preceded by @"
86
+ text: "@@username mention, not a reply"
87
+ expected:
88
+
89
+ urls:
90
+ - description: "Extract a lone URL"
91
+ text: "http://example.com"
92
+ expected: ["http://example.com"]
93
+
94
+ - description: "Extract valid URL: http://google.com"
95
+ text: "text http://google.com"
96
+ expected: ["http://google.com"]
97
+
98
+ - description: "Extract valid URL: http://foobar.com/#"
99
+ text: "text http://foobar.com/#"
100
+ expected: ["http://foobar.com/#"]
101
+
102
+ - description: "Extract valid URL: http://google.com/#foo"
103
+ text: "text http://google.com/#foo"
104
+ expected: ["http://google.com/#foo"]
105
+
106
+ - description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks"
107
+ text: "text http://google.com/#search?q=iphone%20-filter%3Alinks"
108
+ expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"]
109
+
110
+ - description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks"
111
+ text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks"
112
+ expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"]
113
+
114
+ - description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
115
+ text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
116
+ expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"]
117
+
118
+ - description: "Extract valid URL: http://somehost.com:3000"
119
+ text: "text http://somehost.com:3000"
120
+ expected: ["http://somehost.com:3000"]
121
+
122
+ - description: "Extract valid URL: http://x.com/~matthew+%-x"
123
+ text: "text http://x.com/~matthew+%-x"
124
+ expected: ["http://x.com/~matthew+%-x"]
125
+
126
+ - description: "Extract valid URL: http://x.com/~matthew+%-,.;x"
127
+ text: "text http://x.com/~matthew+%-,.;x"
128
+ expected: ["http://x.com/~matthew+%-,.;x"]
129
+
130
+ - description: "Extract valid URL: http://x.com/,.;x"
131
+ text: "text http://x.com/,.;x"
132
+ expected: ["http://x.com/,.;x"]
133
+
134
+ - description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)"
135
+ text: "text http://en.wikipedia.org/wiki/Primer_(film)"
136
+ expected: ["http://en.wikipedia.org/wiki/Primer_(film)"]
137
+
138
+ - description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59"
139
+ text: "text http://www.ams.org/bookstore-getitem/item=mbk-59"
140
+ expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"]
141
+
142
+ - description: "Extract valid URL: http://✪df.ws/ejp"
143
+ text: "text http://✪df.ws/ejp"
144
+ expected: ["http://✪df.ws/ejp"]
145
+
146
+ - description: "Extract valid URL: http://chilp.it/?77e8fd"
147
+ text: "text http://chilp.it/?77e8fd"
148
+ expected: ["http://chilp.it/?77e8fd"]
149
+
150
+ - description: "DO NOT extract invalid URL: http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
151
+ text: "text http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
152
+ expected: []
153
+
154
+ - description: "DO NOT extract invalid URL: http://no-tld"
155
+ text: "text http://no-tld"
156
+ expected: []
157
+
158
+ - description: "DO NOT extract invalid URL: http://tld-too-short.x"
159
+ text: "text http://tld-too-short.x"
160
+ expected: []
161
+
162
+ hashtags:
163
+ - description: "Extract an all-alpha hashtag"
164
+ text: "a #hashtag here"
165
+ expected: ["hashtag"]
166
+
167
+ - description: "Extract a letter-then-number hashtag"
168
+ text: "this is #hashtag1"
169
+ expected: ["hashtag1"]
170
+
171
+ - description: "Extract a number-then-letter hashtag"
172
+ text: "#1hashtag is this"
173
+ expected: ["1hashtag"]
174
+
175
+ - description: "DO NOT Extract an all-numeric hashtag"
176
+ text: "On the #16 bus"
177
+ expected: []
178
+
179
+ - description: "Extract a hashtag containing ñ"
180
+ text: "I'll write more tests #mañana"
181
+ expected: ["mañana"]
182
+
183
+ - description: "Extract a hashtag containing é"
184
+ text: "Working remotely #café"
185
+ expected: ["café"]
186
+
187
+ - description: "Extract a hashtag containing ü"
188
+ text: "Getting my Oktoberfest on #münchen"
189
+ expected: ["münchen"]
190
+
191
+ - description: "DO NOT Extract a hashtag containing Japanese"
192
+ text: "this is not valid: # 会議中 ハッシュ"
193
+ expected: []
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tweetparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Paul Battley
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-23 00:00:00 +00:00
12
+ date: 2010-02-24 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -51,7 +51,11 @@ extensions: []
51
51
  extra_rdoc_files: []
52
52
 
53
53
  files:
54
+ - test/conformance_test.rb
54
55
  - test/parser_test.rb
56
+ - test/twitter-text-conformance/autolink.yml
57
+ - test/twitter-text-conformance/extract.yml
58
+ - test/twitter-text-conformance/README
55
59
  - lib/tweetparser/grammar.treetop
56
60
  - lib/tweetparser.rb
57
61
  has_rdoc: true