RubyGems - tweetparser - Versions diffs - 0.1.0 → 0.2.0 - Mend

tweetparser 0.1.0 → 0.2.0

Files changed (8) hide show

data/lib/tweetparser.rb +12 -0
data/lib/tweetparser/grammar.treetop +37 -9
data/test/conformance_test.rb +48 -0
data/test/parser_test.rb +30 -9
data/test/twitter-text-conformance/README +6 -0
data/test/twitter-text-conformance/autolink.yml +250 -0
data/test/twitter-text-conformance/extract.yml +193 -0
metadata +6 -2

@@ -1,3 +1,15 @@
 require "treetop"
 require "polyglot"
 require "tweetparser/grammar"
+module TweetParser
+  def self.parse(input)
+    kcode = $KCODE
+    $KCODE = "n"
+    parser = TweetContentParser.new
+    parsed = parser.parse(input)
+    $KCODE = kcode
+    return nil unless parsed
+    parsed.content
+  end
+end

data/lib/tweetparser/grammar.treetop CHANGED

@@ -1,38 +1,66 @@
 grammar TweetContent
   rule tweet
-    (url / html / space / newline / atref / hashtag / text)* {
+    (url / html / space / newline / list / username / hashtag / slash / text)* {
       def content
         elements.map{ |e| e.content }
       end
     }
   end
+  rule subdomain
+    ([a-zA-Z0-9\-] / [^\x20-\x7F])+
+  end
   rule url
-    "http" "s"? "://" [\./a-zA-Z0-9\?#=\-_&%]+ {
+    (("http" / "HTTP") [sS]? "://" / "www." / "WWW.") subdomain ("." subdomain)+ ("/" [\.a-zA-Z0-9\?#=\-_&%]*)* {
       def content
         [:url, text_value]
       end
     }
   end
-  rule atref
-    "@" [a-zA-Z0-9_]+ {
+  rule name
+    [a-zA-Z0-9_]+
+  end
+  rule name_with_letters
+    [a-zA-Z] [a-zA-Z0-9_]* / [0-9_]+ [a-zA-Z] [a-zA-Z0-9_]*
+  end
+  rule username
+    ("@" / "＠") name {
+      def content
+        [:username, text_value]
+      end
+    }
+  end
+  rule list
+    username "/" name {
       def content
-        [:atref, text_value]
+        [:list, text_value]
       end
     }
   end
   rule hashtag
-    "#" [a-zA-Z0-9_]+ {
+    ("#" / "＃") name_with_letters {
       def content
         [:hashtag, text_value]
       end
     }
   end
+  rule slash
+    "/" name {
+      def content
+        [:slash, text_value]
+      end
+    }
+  end
   rule text
-    ([^h\s] / "h" !("ttp" "s"? "://"))+ {
+    [\S]+ {
       def content
         [:text, text_value]
       end
@@ -50,13 +78,13 @@ grammar TweetContent
   rule newline
     "\r"? "\n" {
       def content
-        [:newline]
+        [:newline, text_value]
       end
     }
   end
   rule space
-    " "+ {
+    (" " / "　")+ {
       def content
         [:space, text_value]
       end

data/test/conformance_test.rb ADDED

@@ -0,0 +1,48 @@
+# encoding: UTF-8
+$:.unshift(File.expand_path("../../lib", __FILE__))
+require "test/unit"
+require "shoulda"
+require "tweetparser"
+require "yaml"
+require "cgi"
+class AutolinkConformanceTest < Test::Unit::TestCase
+  DATA_PATH = File.expand_path("../twitter-text-conformance/autolink.yml", __FILE__)
+  def assert_autolink(expected, input)
+    sexpr = TweetParser.parse(input)
+    assert sexpr, "Failed to parse #{input}"
+    actual = sexpr.inject(""){ |output, (type, value)|
+      output <<
+        case type
+        when :username
+          at, username = value.scan(/^.|.*/)
+          %{#{at}<a class="tweet-url username" href="http://twitter.com/#{username}">#{username}</a>}
+        when :list
+          at, list = value.scan(/^.|.*/)
+          %{#{at}<a class="tweet-url list-slug" href=\"http://twitter.com/#{list}">#{list}</a>}
+        when :hashtag
+          hash, hashtag = value.scan(/^.|.*/)
+          %{<a href="http://twitter.com/search?q=%23#{hashtag}" }+
+          %{title="\##{hashtag}" class="tweet-url hashtag">#{value}</a>}
+        when :url
+          href = value
+          href = "http://" + value unless href =~ /^http/i
+          %{<a href="#{href}">#{value}</a>}
+        else
+          value
+        end
+    }
+    assert_equal expected, actual, sexpr.inspect
+  end
+  YAML.load(File.read(DATA_PATH))["tests"].each do |section, tests|
+    context "when testing #{section}" do
+      tests.each do |hash|
+        should hash["description"] do
+          assert_autolink hash["expected"], hash["text"]
+        end
+      end
+    end
+  end
+end

data/test/parser_test.rb CHANGED

@@ -6,12 +6,8 @@ require "tweetparser"
 class ParserTest < Test::Unit::TestCase
-  def setup
-    @parser = TweetContentParser.new
-  end
   def assert_parses(expected, input)
-    actual = @parser.parse(input).content
+    actual = TweetParser.parse(input)
     assert_equal expected, actual
   end
@@ -24,6 +20,21 @@ class ParserTest < Test::Unit::TestCase
     assert_parses [[:url, s]], s
   end
+  should "extract url with www and no http" do
+    s = "www.example.com/mail/?ui=2&shva=1#inbox"
+    assert_parses [[:url, s]], s
+  end
+  should "extract IDN url" do
+    s = "http://✪df.ws/ejp"
+    assert_parses [[:url, s]], s
+  end
+  should "not extract invalid domain" do
+    s = "http://example_com/mail/?ui=2&shva=1#inbox"
+    assert_parses [[:text, s]], s
+  end
   should "extract hashtag" do
     s = "#HashTag2010"
     assert_parses [[:hashtag, s]], s
@@ -31,7 +42,7 @@ class ParserTest < Test::Unit::TestCase
   should "extract at-references" do
     s = "@AtRef_3000"
-    assert_parses [[:atref, s]], s
+    assert_parses [[:username, s]], s
   end
   should "extract HTML" do
@@ -39,9 +50,19 @@ class ParserTest < Test::Unit::TestCase
     assert_parses [[:html, s]], s
   end
+  should "extract a slash comment" do
+    s = %{/via}
+    assert_parses [[:slash, s]], s
+  end
+  should "extract a list" do
+    s = %{@username/list}
+    assert_parses [[:list, s]], s
+  end
   should "extract words spaces and new lines" do
     s = "this string\nhas spaces!"
-    expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
+    expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline, "\n"],
                 [:text, "has"], [:space, " "], [:text, "spaces!"]]
     assert_parses expected, s
   end
@@ -51,7 +72,7 @@ class ParserTest < Test::Unit::TestCase
     expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, "  "],
                 [:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
                 [:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
-                [:html, "</a>"], [:newline],
+                [:html, "</a>"], [:newline, "\n"],
                 [:url, "http://twitpic.com/14vzny"],
                 [:space, " "], [:text, "3"], [:space, " "],
                 [:url, "http://twitpic.com/14vzny"]]
@@ -60,7 +81,7 @@ class ParserTest < Test::Unit::TestCase
   should "extract elements from real-world sample" do
     s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
-    expected = [[:text, "RT"], [:space, " "], [:atref, "@newsbrooke"], [:space, " "],
+    expected = [[:text, "RT"], [:space, " "], [:username, "@newsbrooke"], [:space, " "],
                 [:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
                 [:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
                 [:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],

data/test/twitter-text-conformance/README ADDED

@@ -0,0 +1,6 @@
+YAML files that define the conformance tests for the twitter-text-* libraries.
+== TODO
+ * Describe the format in the README
+ * Add more tests (ongoing)

data/test/twitter-text-conformance/autolink.yml ADDED

@@ -0,0 +1,250 @@
+tests:
+  usernames:
+    - description: "Autolink trailing username"
+      text: "text @username"
+      expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
+    - description: "Autolink username at the beginning"
+      text: "@username text"
+      expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a> text"
+    - description: "DO NOT Autolink username preceded by a letter"
+      text: "meet@the beach"
+      expected: "meet@the beach"
+    - description: "Autolink username preceded by puctuation"
+      text: "great.@username"
+      expected: "great.@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
+    - description: "Autolink username followed by puctuation"
+      text: "@username&^$%^"
+      expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>&^$%^"
+    - description: "Autolink username followed by Japanese"
+      text: "@usernameの"
+      expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
+    - description: "Autolink username preceded by Japanese"
+      text: "あ@username"
+      expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
+    - description: "Autolink username surrounded by Japanese"
+      text: "あ@usernameの"
+      expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
+    - description: "Autolink username with full-width at sign (U+FF20)"
+      text: "＠username"
+      expected: "＠<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
+    - description: "DO NOT Autolink username over 20 characters"
+      text: "@username9012345678901"
+      expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username901234567890\">username901234567890</a>1"
+  lists:
+    - description: "Autolink list preceded by a space"
+      text: "text @username/list"
+      expected: "text @<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
+    - description: "DO NOT Autolink list when space follows slash"
+      text: "text @username/ list"
+      expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>/ list"
+    - description: "DO NOT Autolink list with empty username"
+      text: "text @/list"
+      expected: "text @/list"
+    - description: "Autolink list at the beginning"
+      text: "@username/list"
+      expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
+    - description: "DO NOT Autolink list preceded by letter"
+      text: "meet@the/beach"
+      expected: "meet@the/beach"
+    - description: "Autolink list preceded by puctuation"
+      text: "great.@username/list"
+      expected: "great.@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
+    - description: "Autolink list followed by puctuation"
+      text: "@username/list&^$%^"
+      expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>&^$%^"
+    - description: "Autolink list name over 80 characters (truncated to 80)"
+      text: "@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A"
+      expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list5678901234567890123456789012345678901234567890123456789012345678901234567890\">username/list5678901234567890123456789012345678901234567890123456789012345678901234567890</a>A"
+  hashtags:
+    - description: "Autolink trailing hashtag"
+      text: "text #hashtag"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
+    - description: "Autolink alphanumeric hashtag (letter-number-letter)"
+      text: "text #hash0tag"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hash0tag\" title=\"#hash0tag\" class=\"tweet-url hashtag\">#hash0tag</a>"
+    - description: "Autolink alphanumeric hashtag (number-letter)"
+      text: "text #1tag"
+      expected: "text <a href=\"http://twitter.com/search?q=%231tag\" title=\"#1tag\" class=\"tweet-url hashtag\">#1tag</a>"
+    - description: "Autolink hashtag with underscore"
+      text: "text #hash_tag"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hash_tag\" title=\"#hash_tag\" class=\"tweet-url hashtag\">#hash_tag</a>"
+    - description: "DO NOT Autolink all-numeric hashtags"
+      text: "text #1234"
+      expected: "text #1234"
+    - description: "DO NOT Autolink hashtag preceded by a letter"
+      text: "text#hashtag"
+      expected: "text#hashtag"
+    - description: "Autolink multiple hashtags"
+      text: "text #hashtag1 #hashtag2"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hashtag1\" title=\"#hashtag1\" class=\"tweet-url hashtag\">#hashtag1</a> <a href=\"http://twitter.com/search?q=%23hashtag2\" title=\"#hashtag2\" class=\"tweet-url hashtag\">#hashtag2</a>"
+    - description: "Autolink hashtag preceded by a period"
+      text: "text.#hashtag"
+      expected: "text.<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
+    - description: "DO NOT Autolink hashtag preceded by &"
+      text: "&#nbsp;"
+      expected: "&#nbsp;"
+    - description: "Autolink hashtag followed by ! (! not included)"
+      text: "text #hashtag!"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>!"
+    - description: "Autolink hashtag followed by Japanese"
+      text: "text #hashtagの"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>の"
+    - description: "Autolink hashtag preceded by full-width space (U+3000)"
+      text: "text　#hashtag"
+      expected: "text　<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
+    - description: "Autolink hashtag followed by full-width space (U+3000)"
+      text: "#hashtag　text"
+      expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>　text"
+    - description: "Autolink hashtag with full-width hash (U+FF03)"
+      text: "＃hashtag"
+      expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">＃hashtag</a>"
+  urls:
+    - description: "Autolink trailing url"
+      text: "text http://example.com"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>"
+    - description: "Autolink url in mid-text"
+      text: "text http://example.com more text"
+      expected: "text <a href=\"http://example.com\">http://example.com</a> more text"
+    - description: "Autolink url in Japanese text"
+      text: "いまなにしてるhttp://example.comいまなにしてる"
+      expected: "いまなにしてる<a href=\"http://example.com\">http://example.com</a>いまなにしてる"
+    - description: "Autolink url surrounded by parentheses"
+      text: "text (http://example.com)"
+      expected: "text (<a href=\"http://example.com\">http://example.com</a>)"
+    - description: "Autolink url containing unicode characters"
+      text: "I enjoy Macintosh Brand computers: http://✪df.ws/ejp"
+      expected: "I enjoy Macintosh Brand computers: <a href=\"http://✪df.ws/ejp\">http://✪df.ws/ejp</a>"
+    - description: "DO NOT Autolink url containing ! character in the domain"
+      text: "badly formatted http://foo!bar.com"
+      expected: "badly formatted http://foo!bar.com"
+    - description: "DO NOT Autolink url containing _ character in the domain"
+      text: "badly formatted http://foo_bar.com"
+      expected: "badly formatted http://foo_bar.com"
+    - description: "Autolink url preceded by :"
+      text: "text:http://example.com"
+      expected: "text:<a href=\"http://example.com\">http://example.com</a>"
+    - description: "Autolink url followed by ? (without it)"
+      text: "text http://example.com?"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>?"
+    - description: "Autolink url followed by ! (without it)"
+      text: "text http://example.com!"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>!"
+    - description: "Autolink url followed by , (without it)"
+      text: "text http://example.com,"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>,"
+    - description: "Autolink url followed by . (without it)"
+      text: "text http://example.com."
+      expected: "text <a href=\"http://example.com\">http://example.com</a>."
+    - description: "Autolink url followed by : (without it)"
+      text: "text http://example.com:"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>:"
+    - description: "Autolink url followed by ; (without it)"
+      text: "text http://example.com;"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>;"
+    - description: "Autolink url followed by ] (without it)"
+      text: "text http://example.com]"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>]"
+    - description: "Autolink url followed by ) (without it)"
+      text: "text http://example.com)"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>)"
+    - description: "Autolink url followed by } (without it)"
+      text: "text http://example.com}"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>}"
+    - description: "Autolink url followed by = (without it)"
+      text: "text http://example.com="
+      expected: "text <a href=\"http://example.com\">http://example.com</a>="
+    - description: "Autolink url followed by ' (without it)"
+      text: "text http://example.com'"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>'"
+    - description: "DO NOT Autolink url preceded by '"
+      text: "text 'http://example.com"
+      expected: "text 'http://example.com"
+    - description: "DO NOT Autolink url preceded by /"
+      text: "text /http://example.com"
+      expected: "text /http://example.com"
+    - description: "DO NOT Autolink url preceded by !"
+      text: "text !http://example.com"
+      expected: "text !http://example.com"
+    - description: "DO NOT Autolink url preceded by ="
+      text: "text =http://example.com"
+      expected: "text =http://example.com"
+    - description: "Autolink url embedded in link tag"
+      text: "<link rel='true'>http://example.com</link>"
+      expected: "<link rel='true'><a href=\"http://example.com\">http://example.com</a></link>"
+    - description: "Autolink multiple urls"
+      text: "http://example.com https://sslexample.com http://sub.example.com"
+      expected: "<a href=\"http://example.com\">http://example.com</a> <a href=\"https://sslexample.com\">https://sslexample.com</a> <a href=\"http://sub.example.com\">http://sub.example.com</a>"
+    - description: "Autolink url with long TLD"
+      text: "http://example.mobi/path"
+      expected: "<a href=\"http://example.mobi/path\">http://example.mobi/path</a>"
+    - description: "Autolink url without protocol (with www)"
+      text: "www.example.com"
+      expected: "<a href=\"http://www.example.com\">www.example.com</a>"
+    - description: "Autolink url without protocol (with WWW)"
+      text: "WWW.EXAMPLE.COM"
+      expected: "<a href=\"http://WWW.EXAMPLE.COM\">WWW.EXAMPLE.COM</a>"
+  all:
+    - description: "Autolink url does not overlap @username"
+      text: "Check out: http://example.com/test&@chasesechrist"
+      expected: "Check out: <a href=\"http://example.com/test&\">http://example.com/test&</a>@<a class=\"tweet-url username\" href=\"http://twitter.com/chasesechrist\">chasesechrist</a>"

data/test/twitter-text-conformance/extract.yml ADDED

@@ -0,0 +1,193 @@
+tests:
+  mentions:
+    - description: "Extract mention at the begining of a tweet"
+      text: "@username reply"
+      expected: ["username"]
+    - description: "Extract mention at the end of a tweet"
+      text: "mention @username"
+      expected: ["username"]
+    - description: "Extract mention in the middle of a tweet"
+      text: "mention @username in the middle"
+      expected: ["username"]
+    - description: "Extract mention of username with underscore"
+      text: "mention @user_name"
+      expected: ["user_name"]
+    - description: "Extract mention of all numeric username"
+      text: "mention @12345"
+      expected: ["12345"]
+    - description: "Extract mention or multiple usernames"
+      text: "mention @username1 @username2"
+      expected: ["username1", "username2"]
+    - description: "Extract mention in the middle of a Japanese tweet"
+      text: "の@usernameに到着を待っている"
+      expected: ["username"]
+    - description: "DO NOT extract username ending in @"
+      text: "Current Status: @_@ (cc: @username)"
+      expected: ["username"]
+    - description: "Extract lone metion but not @user@user (too close to an email)"
+      text: "@username email me @test@example.com"
+      expected: ["username"]
+  replies:
+    - description: "Extract reply at the begining of a tweet"
+      text: "@username reply"
+      expected: "username"
+    - description: "Extract reply preceded by only a space"
+      text: " @username reply"
+      expected: "username"
+    - description: "Extract reply preceded by only a full-width space (U+3000)"
+      text: "　@username reply"
+      expected: "username"
+    - description: "DO NOT Extract reply when preceded by text"
+      text: "a @username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by ."
+      text: ".@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by /"
+      text: "/@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by _"
+      text: "_@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by -"
+      text: "-@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by +"
+      text: "+@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by #"
+      text: "#@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by !"
+      text: "!@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by @"
+      text: "@@username mention, not a reply"
+      expected:
+  urls:
+    - description: "Extract a lone URL"
+      text: "http://example.com"
+      expected: ["http://example.com"]
+    - description: "Extract valid URL: http://google.com"
+      text: "text http://google.com"
+      expected: ["http://google.com"]
+    - description: "Extract valid URL: http://foobar.com/#"
+      text: "text http://foobar.com/#"
+      expected: ["http://foobar.com/#"]
+    - description: "Extract valid URL: http://google.com/#foo"
+      text: "text http://google.com/#foo"
+      expected: ["http://google.com/#foo"]
+    - description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks"
+      text: "text http://google.com/#search?q=iphone%20-filter%3Alinks"
+      expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"]
+    - description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks"
+      text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks"
+      expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"]
+    - description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
+      text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
+      expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"]
+    - description: "Extract valid URL: http://somehost.com:3000"
+      text: "text http://somehost.com:3000"
+      expected: ["http://somehost.com:3000"]
+    - description: "Extract valid URL: http://x.com/~matthew+%-x"
+      text: "text http://x.com/~matthew+%-x"
+      expected: ["http://x.com/~matthew+%-x"]
+    - description: "Extract valid URL: http://x.com/~matthew+%-,.;x"
+      text: "text http://x.com/~matthew+%-,.;x"
+      expected: ["http://x.com/~matthew+%-,.;x"]
+    - description: "Extract valid URL: http://x.com/,.;x"
+      text: "text http://x.com/,.;x"
+      expected: ["http://x.com/,.;x"]
+    - description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)"
+      text: "text http://en.wikipedia.org/wiki/Primer_(film)"
+      expected: ["http://en.wikipedia.org/wiki/Primer_(film)"]
+    - description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59"
+      text: "text http://www.ams.org/bookstore-getitem/item=mbk-59"
+      expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"]
+    - description: "Extract valid URL: http://✪df.ws/ejp"
+      text: "text http://✪df.ws/ejp"
+      expected: ["http://✪df.ws/ejp"]
+    - description: "Extract valid URL: http://chilp.it/?77e8fd"
+      text: "text http://chilp.it/?77e8fd"
+      expected: ["http://chilp.it/?77e8fd"]
+    - description: "DO NOT extract invalid URL: http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
+      text: "text http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
+      expected: []
+    - description: "DO NOT extract invalid URL: http://no-tld"
+      text: "text http://no-tld"
+      expected: []
+    - description: "DO NOT extract invalid URL: http://tld-too-short.x"
+      text: "text http://tld-too-short.x"
+      expected: []
+  hashtags:
+    - description: "Extract an all-alpha hashtag"
+      text: "a #hashtag here"
+      expected: ["hashtag"]
+    - description: "Extract a letter-then-number hashtag"
+      text: "this is #hashtag1"
+      expected: ["hashtag1"]
+    - description: "Extract a number-then-letter hashtag"
+      text: "#1hashtag is this"
+      expected: ["1hashtag"]
+    - description: "DO NOT Extract an all-numeric hashtag"
+      text: "On the #16 bus"
+      expected: []
+    - description: "Extract a hashtag containing ñ"
+      text: "I'll write more tests #mañana"
+      expected: ["mañana"]
+    - description: "Extract a hashtag containing é"
+      text: "Working remotely #café"
+      expected: ["café"]
+    - description: "Extract a hashtag containing ü"
+      text: "Getting my Oktoberfest on #münchen"
+      expected: ["münchen"]
+    - description: "DO NOT Extract a hashtag containing Japanese"
+      text: "this is not valid: # 会議中 ハッシュ"
+      expected: []

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tweetparser
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Paul Battley
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-02-23 00:00:00 +00:00
+date: 2010-02-24 00:00:00 +00:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -51,7 +51,11 @@ extensions: []
 extra_rdoc_files: []
 files:
+- test/conformance_test.rb
 - test/parser_test.rb
+- test/twitter-text-conformance/autolink.yml
+- test/twitter-text-conformance/extract.yml
+- test/twitter-text-conformance/README
 - lib/tweetparser/grammar.treetop
 - lib/tweetparser.rb
 has_rdoc: true