RubyGems - tweetparser - Versions diffs - 0.1.0 → 0.2.0 - Mend

tweetparser 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/lib/tweetparser.rb +12 -0
data/lib/tweetparser/grammar.treetop +37 -9
data/test/conformance_test.rb +48 -0
data/test/parser_test.rb +30 -9
data/test/twitter-text-conformance/README +6 -0
data/test/twitter-text-conformance/autolink.yml +250 -0
data/test/twitter-text-conformance/extract.yml +193 -0
metadata +6 -2

data/lib/tweetparser.rb CHANGED

@@ -1,3 +1,15 @@
 require "treetop"
 require "polyglot"
 require "tweetparser/grammar"
+module TweetParser
+  def self.parse(input)
+    kcode = $KCODE
+    $KCODE = "n"
+    parser = TweetContentParser.new
+    parsed = parser.parse(input)
+    $KCODE = kcode
+    return nil unless parsed
+    parsed.content
+  end
+end

data/lib/tweetparser/grammar.treetop CHANGED

@@ -1,38 +1,66 @@
 grammar TweetContent
   rule tweet
-    (url / html / space / newline / atref / hashtag / text)* {
+    (url / html / space / newline / list / username / hashtag / slash / text)* {
       def content
         elements.map{ |e| e.content }
       end
     }
   end
+  rule subdomain
+    ([a-zA-Z0-9\-] / [^\x20-\x7F])+
+  end
   rule url
-    "http" "s"? "://" [\./a-zA-Z0-9\?#=\-_&%]+ {
+    (("http" / "HTTP") [sS]? "://" / "www." / "WWW.") subdomain ("." subdomain)+ ("/" [\.a-zA-Z0-9\?#=\-_&%]*)* {
       def content
         [:url, text_value]
       end
     }
   end
-  rule atref
-    "@" [a-zA-Z0-9_]+ {
+  rule name
+    [a-zA-Z0-9_]+
+  end
+  rule name_with_letters
+    [a-zA-Z] [a-zA-Z0-9_]* / [0-9_]+ [a-zA-Z] [a-zA-Z0-9_]*
+  end
+  rule username
+    ("@" / "＠") name {
+      def content
+        [:username, text_value]
+      end
+    }
+  end
+  rule list
+    username "/" name {
       def content
-        [:atref, text_value]
+        [:list, text_value]
       end
     }
   end
   rule hashtag
-    "#" [a-zA-Z0-9_]+ {
+    ("#" / "＃") name_with_letters {
       def content
         [:hashtag, text_value]
       end
     }
   end
+  rule slash
+    "/" name {
+      def content
+        [:slash, text_value]
+      end
+    }
+  end
   rule text
-    ([^h\s] / "h" !("ttp" "s"? "://"))+ {
+    [\S]+ {
       def content
         [:text, text_value]
       end
@@ -50,13 +78,13 @@ grammar TweetContent
   rule newline
     "\r"? "\n" {
       def content
-        [:newline]
+        [:newline, text_value]
       end
     }
   end
   rule space
-    " "+ {
+    (" " / "　")+ {
       def content
         [:space, text_value]
       end

data/test/conformance_test.rb ADDED

@@ -0,0 +1,48 @@
+# encoding: UTF-8
+$:.unshift(File.expand_path("../../lib", __FILE__))
+require "test/unit"
+require "shoulda"
+require "tweetparser"
+require "yaml"
+require "cgi"
+# Drives TweetParser with the cases in twitter-text-conformance/autolink.yml:
+# each parsed token list is rendered to HTML here and compared with the
+# fixture's expected markup.
+class AutolinkConformanceTest < Test::Unit::TestCase
+  # Location of the autolink fixture, resolved relative to this test file.
+  DATA_PATH = File.expand_path("../twitter-text-conformance/autolink.yml", __FILE__)
+  # Parses +input+, renders the resulting [type, value] pairs as an HTML
+  # string and asserts that it equals +expected+.
+  #
+  # expected - the HTML string the fixture expects.
+  # input    - the raw tweet text.
+  #
+  # Fails early when TweetParser.parse returns a falsy result; on mismatch
+  # the inspected token list is used as the failure message.
+  def assert_autolink(expected, input)
+    sexpr = TweetParser.parse(input)
+    assert sexpr, "Failed to parse #{input}"
+    # Fold the tokens into one string; each branch of the case yields the
+    # fragment appended for that token type.
+    actual = sexpr.inject(""){ |output, (type, value)|
+      output <<
+        case type
+        when :username
+          # `^.` picks off the leading character (the at sign) and `.*` the
+          # remainder; any further matches are dropped by the assignment.
+          # NOTE(review): for a full-width sigil this relies on `.` matching
+          # a whole character rather than a byte - confirm on the target Ruby.
+          at, username = value.scan(/^.|.*/)
+          %{#{at}<a class="tweet-url username" href="http://twitter.com/#{username}">#{username}</a>}
+        when :list
+          # Same split as above; `list` keeps the "user/list" part.
+          at, list = value.scan(/^.|.*/)
+          %{#{at}<a class="tweet-url list-slug" href=\"http://twitter.com/#{list}">#{list}</a>}
+        when :hashtag
+          # `hash` (the leading sigil) is not used afterwards: the title is
+          # built with an ASCII "#", while the link text reuses the whole
+          # original value.
+          hash, hashtag = value.scan(/^.|.*/)
+          %{<a href="http://twitter.com/search?q=%23#{hashtag}" }+
+          %{title="\##{hashtag}" class="tweet-url hashtag">#{value}</a>}
+        when :url
+          # Tokens that do not start with "http" (case-insensitive) get an
+          # "http://" prefix in the href only; the link text is unchanged.
+          href = value
+          href = "http://" + value unless href =~ /^http/i
+          %{<a href="#{href}">#{value}</a>}
+        else
+          # Every other token type is emitted verbatim.
+          value
+        end
+    }
+    assert_equal expected, actual, sexpr.inspect
+  end
+  # Define one context per section under "tests" in the fixture and one
+  # `should` block per case, named after the case's description.
+  YAML.load(File.read(DATA_PATH))["tests"].each do |section, tests|
+    context "when testing #{section}" do
+      tests.each do |hash|
+        should hash["description"] do
+          assert_autolink hash["expected"], hash["text"]
+        end
+      end
+    end
+  end
+end

data/test/parser_test.rb CHANGED

@@ -6,12 +6,8 @@ require "tweetparser"
 class ParserTest < Test::Unit::TestCase
-  def setup
-    @parser = TweetContentParser.new
-  end
   def assert_parses(expected, input)
-    actual = @parser.parse(input).content
+    actual = TweetParser.parse(input)
     assert_equal expected, actual
   end
@@ -24,6 +20,21 @@ class ParserTest < Test::Unit::TestCase
     assert_parses [[:url, s]], s
   end
+  should "extract url with www and no http" do
+    s = "www.example.com/mail/?ui=2&shva=1#inbox"
+    assert_parses [[:url, s]], s
+  end
+  should "extract IDN url" do
+    s = "http://✪df.ws/ejp"
+    assert_parses [[:url, s]], s
+  end
+  should "not extract invalid domain" do
+    s = "http://example_com/mail/?ui=2&shva=1#inbox"
+    assert_parses [[:text, s]], s
+  end
   should "extract hashtag" do
     s = "#HashTag2010"
     assert_parses [[:hashtag, s]], s
@@ -31,7 +42,7 @@ class ParserTest < Test::Unit::TestCase
   should "extract at-references" do
     s = "@AtRef_3000"
-    assert_parses [[:atref, s]], s
+    assert_parses [[:username, s]], s
   end
   should "extract HTML" do
@@ -39,9 +50,19 @@ class ParserTest < Test::Unit::TestCase
     assert_parses [[:html, s]], s
   end
+  should "extract a slash comment" do
+    s = %{/via}
+    assert_parses [[:slash, s]], s
+  end
+  should "extract a list" do
+    s = %{@username/list}
+    assert_parses [[:list, s]], s
+  end
   should "extract words spaces and new lines" do
     s = "this string\nhas spaces!"
-    expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
+    expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline, "\n"],
                 [:text, "has"], [:space, " "], [:text, "spaces!"]]
     assert_parses expected, s
   end
@@ -51,7 +72,7 @@ class ParserTest < Test::Unit::TestCase
     expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, "  "],
                 [:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
                 [:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
-                [:html, "</a>"], [:newline],
+                [:html, "</a>"], [:newline, "\n"],
                 [:url, "http://twitpic.com/14vzny"],
                 [:space, " "], [:text, "3"], [:space, " "],
                 [:url, "http://twitpic.com/14vzny"]]
@@ -60,7 +81,7 @@ class ParserTest < Test::Unit::TestCase
   should "extract elements from real-world sample" do
     s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
-    expected = [[:text, "RT"], [:space, " "], [:atref, "@newsbrooke"], [:space, " "],
+    expected = [[:text, "RT"], [:space, " "], [:username, "@newsbrooke"], [:space, " "],
                 [:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
                 [:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
                 [:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],

data/test/twitter-text-conformance/README ADDED

@@ -0,0 +1,6 @@
+Yaml files that define the conformance testing for twitter-text-* libraries.
+== TODO
+ * Describe the format in the README
+ * Add more tests (ongoing)

data/test/twitter-text-conformance/autolink.yml ADDED

@@ -0,0 +1,250 @@
+tests:
+  usernames:
+    - description: "Autolink trailing username"
+      text: "text @username"
+      expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
+    - description: "Autolink username at the beginning"
+      text: "@username text"
+      expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a> text"
+    - description: "DO NOT Autolink username preceded by a letter"
+      text: "meet@the beach"
+      expected: "meet@the beach"
+    - description: "Autolink username preceded by punctuation"
+      text: "great.@username"
+      expected: "great.@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
+    - description: "Autolink username followed by punctuation"
+      text: "@username&^$%^"
+      expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>&^$%^"
+    - description: "Autolink username followed by Japanese"
+      text: "@usernameの"
+      expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
+    - description: "Autolink username preceded by Japanese"
+      text: "あ@username"
+      expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
+    - description: "Autolink username surrounded by Japanese"
+      text: "あ@usernameの"
+      expected: "あ@<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>の"
+    - description: "Autolink username with full-width at sign (U+FF20)"
+      text: "＠username"
+      expected: "＠<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>"
+    - description: "DO NOT Autolink username over 20 characters"
+      text: "@username9012345678901"
+      expected: "@<a class=\"tweet-url username\" href=\"http://twitter.com/username901234567890\">username901234567890</a>1"
+  lists:
+    - description: "Autolink list preceded by a space"
+      text: "text @username/list"
+      expected: "text @<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
+    - description: "DO NOT Autolink list when space follows slash"
+      text: "text @username/ list"
+      expected: "text @<a class=\"tweet-url username\" href=\"http://twitter.com/username\">username</a>/ list"
+    - description: "DO NOT Autolink list with empty username"
+      text: "text @/list"
+      expected: "text @/list"
+    - description: "Autolink list at the beginning"
+      text: "@username/list"
+      expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
+    - description: "DO NOT Autolink list preceded by letter"
+      text: "meet@the/beach"
+      expected: "meet@the/beach"
+    - description: "Autolink list preceded by punctuation"
+      text: "great.@username/list"
+      expected: "great.@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>"
+    - description: "Autolink list followed by punctuation"
+      text: "@username/list&^$%^"
+      expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list\">username/list</a>&^$%^"
+    - description: "Autolink list name over 80 characters (truncated to 80)"
+      text: "@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A"
+      expected: "@<a class=\"tweet-url list-slug\" href=\"http://twitter.com/username/list5678901234567890123456789012345678901234567890123456789012345678901234567890\">username/list5678901234567890123456789012345678901234567890123456789012345678901234567890</a>A"
+  hashtags:
+    - description: "Autolink trailing hashtag"
+      text: "text #hashtag"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
+    - description: "Autolink alphanumeric hashtag (letter-number-letter)"
+      text: "text #hash0tag"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hash0tag\" title=\"#hash0tag\" class=\"tweet-url hashtag\">#hash0tag</a>"
+    - description: "Autolink alphanumeric hashtag (number-letter)"
+      text: "text #1tag"
+      expected: "text <a href=\"http://twitter.com/search?q=%231tag\" title=\"#1tag\" class=\"tweet-url hashtag\">#1tag</a>"
+    - description: "Autolink hashtag with underscore"
+      text: "text #hash_tag"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hash_tag\" title=\"#hash_tag\" class=\"tweet-url hashtag\">#hash_tag</a>"
+    - description: "DO NOT Autolink all-numeric hashtags"
+      text: "text #1234"
+      expected: "text #1234"
+    - description: "DO NOT Autolink hashtag preceded by a letter"
+      text: "text#hashtag"
+      expected: "text#hashtag"
+    - description: "Autolink multiple hashtags"
+      text: "text #hashtag1 #hashtag2"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hashtag1\" title=\"#hashtag1\" class=\"tweet-url hashtag\">#hashtag1</a> <a href=\"http://twitter.com/search?q=%23hashtag2\" title=\"#hashtag2\" class=\"tweet-url hashtag\">#hashtag2</a>"
+    - description: "Autolink hashtag preceded by a period"
+      text: "text.#hashtag"
+      expected: "text.<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
+    - description: "DO NOT Autolink hashtag preceded by &"
+      text: "&#nbsp;"
+      expected: "&#nbsp;"
+    - description: "Autolink hashtag followed by ! (! not included)"
+      text: "text #hashtag!"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>!"
+    - description: "Autolink hashtag followed by Japanese"
+      text: "text #hashtagの"
+      expected: "text <a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>の"
+    - description: "Autolink hashtag preceded by full-width space (U+3000)"
+      text: "text　#hashtag"
+      expected: "text　<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>"
+    - description: "Autolink hashtag followed by full-width space (U+3000)"
+      text: "#hashtag　text"
+      expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">#hashtag</a>　text"
+    - description: "Autolink hashtag with full-width hash (U+FF03)"
+      text: "＃hashtag"
+      expected: "<a href=\"http://twitter.com/search?q=%23hashtag\" title=\"#hashtag\" class=\"tweet-url hashtag\">＃hashtag</a>"
+  urls:
+    - description: "Autolink trailing url"
+      text: "text http://example.com"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>"
+    - description: "Autolink url in mid-text"
+      text: "text http://example.com more text"
+      expected: "text <a href=\"http://example.com\">http://example.com</a> more text"
+    - description: "Autolink url in Japanese text"
+      text: "いまなにしてるhttp://example.comいまなにしてる"
+      expected: "いまなにしてる<a href=\"http://example.com\">http://example.com</a>いまなにしてる"
+    - description: "Autolink url surrounded by parentheses"
+      text: "text (http://example.com)"
+      expected: "text (<a href=\"http://example.com\">http://example.com</a>)"
+    - description: "Autolink url containing unicode characters"
+      text: "I enjoy Macintosh Brand computers: http://✪df.ws/ejp"
+      expected: "I enjoy Macintosh Brand computers: <a href=\"http://✪df.ws/ejp\">http://✪df.ws/ejp</a>"
+    - description: "DO NOT Autolink url containing ! character in the domain"
+      text: "badly formatted http://foo!bar.com"
+      expected: "badly formatted http://foo!bar.com"
+    - description: "DO NOT Autolink url containing _ character in the domain"
+      text: "badly formatted http://foo_bar.com"
+      expected: "badly formatted http://foo_bar.com"
+    - description: "Autolink url preceded by :"
+      text: "text:http://example.com"
+      expected: "text:<a href=\"http://example.com\">http://example.com</a>"
+    - description: "Autolink url followed by ? (without it)"
+      text: "text http://example.com?"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>?"
+    - description: "Autolink url followed by ! (without it)"
+      text: "text http://example.com!"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>!"
+    - description: "Autolink url followed by , (without it)"
+      text: "text http://example.com,"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>,"
+    - description: "Autolink url followed by . (without it)"
+      text: "text http://example.com."
+      expected: "text <a href=\"http://example.com\">http://example.com</a>."
+    - description: "Autolink url followed by : (without it)"
+      text: "text http://example.com:"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>:"
+    - description: "Autolink url followed by ; (without it)"
+      text: "text http://example.com;"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>;"
+    - description: "Autolink url followed by ] (without it)"
+      text: "text http://example.com]"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>]"
+    - description: "Autolink url followed by ) (without it)"
+      text: "text http://example.com)"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>)"
+    - description: "Autolink url followed by } (without it)"
+      text: "text http://example.com}"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>}"
+    - description: "Autolink url followed by = (without it)"
+      text: "text http://example.com="
+      expected: "text <a href=\"http://example.com\">http://example.com</a>="
+    - description: "Autolink url followed by ' (without it)"
+      text: "text http://example.com'"
+      expected: "text <a href=\"http://example.com\">http://example.com</a>'"
+    - description: "DO NOT Autolink url preceded by '"
+      text: "text 'http://example.com"
+      expected: "text 'http://example.com"
+    - description: "DO NOT Autolink url preceded by /"
+      text: "text /http://example.com"
+      expected: "text /http://example.com"
+    - description: "DO NOT Autolink url preceded by !"
+      text: "text !http://example.com"
+      expected: "text !http://example.com"
+    - description: "DO NOT Autolink url preceded by ="
+      text: "text =http://example.com"
+      expected: "text =http://example.com"
+    - description: "Autolink url embedded in link tag"
+      text: "<link rel='true'>http://example.com</link>"
+      expected: "<link rel='true'><a href=\"http://example.com\">http://example.com</a></link>"
+    - description: "Autolink multiple urls"
+      text: "http://example.com https://sslexample.com http://sub.example.com"
+      expected: "<a href=\"http://example.com\">http://example.com</a> <a href=\"https://sslexample.com\">https://sslexample.com</a> <a href=\"http://sub.example.com\">http://sub.example.com</a>"
+    - description: "Autolink url with long TLD"
+      text: "http://example.mobi/path"
+      expected: "<a href=\"http://example.mobi/path\">http://example.mobi/path</a>"
+    - description: "Autolink url without protocol (with www)"
+      text: "www.example.com"
+      expected: "<a href=\"http://www.example.com\">www.example.com</a>"
+    - description: "Autolink url without protocol (with WWW)"
+      text: "WWW.EXAMPLE.COM"
+      expected: "<a href=\"http://WWW.EXAMPLE.COM\">WWW.EXAMPLE.COM</a>"
+  all:
+    - description: "Autolink url does not overlap @username"
+      text: "Check out: http://example.com/test&@chasesechrist"
+      expected: "Check out: <a href=\"http://example.com/test&\">http://example.com/test&</a>@<a class=\"tweet-url username\" href=\"http://twitter.com/chasesechrist\">chasesechrist</a>"

data/test/twitter-text-conformance/extract.yml ADDED

@@ -0,0 +1,193 @@
+tests:
+  mentions:
+    - description: "Extract mention at the beginning of a tweet"
+      text: "@username reply"
+      expected: ["username"]
+    - description: "Extract mention at the end of a tweet"
+      text: "mention @username"
+      expected: ["username"]
+    - description: "Extract mention in the middle of a tweet"
+      text: "mention @username in the middle"
+      expected: ["username"]
+    - description: "Extract mention of username with underscore"
+      text: "mention @user_name"
+      expected: ["user_name"]
+    - description: "Extract mention of all numeric username"
+      text: "mention @12345"
+      expected: ["12345"]
+    - description: "Extract mentions of multiple usernames"
+      text: "mention @username1 @username2"
+      expected: ["username1", "username2"]
+    - description: "Extract mention in the middle of a Japanese tweet"
+      text: "の@usernameに到着を待っている"
+      expected: ["username"]
+    - description: "DO NOT extract username ending in @"
+      text: "Current Status: @_@ (cc: @username)"
+      expected: ["username"]
+    - description: "Extract lone mention but not @user@user (too close to an email)"
+      text: "@username email me @test@example.com"
+      expected: ["username"]
+  replies:
+    - description: "Extract reply at the beginning of a tweet"
+      text: "@username reply"
+      expected: "username"
+    - description: "Extract reply preceded by only a space"
+      text: " @username reply"
+      expected: "username"
+    - description: "Extract reply preceded by only a full-width space (U+3000)"
+      text: "　@username reply"
+      expected: "username"
+    - description: "DO NOT Extract reply when preceded by text"
+      text: "a @username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by ."
+      text: ".@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by /"
+      text: "/@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by _"
+      text: "_@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by -"
+      text: "-@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by +"
+      text: "+@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by #"
+      text: "#@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by !"
+      text: "!@username mention, not a reply"
+      expected:
+    - description: "DO NOT Extract reply when preceded by @"
+      text: "@@username mention, not a reply"
+      expected:
+  urls:
+    - description: "Extract a lone URL"
+      text: "http://example.com"
+      expected: ["http://example.com"]
+    - description: "Extract valid URL: http://google.com"
+      text: "text http://google.com"
+      expected: ["http://google.com"]
+    - description: "Extract valid URL: http://foobar.com/#"
+      text: "text http://foobar.com/#"
+      expected: ["http://foobar.com/#"]
+    - description: "Extract valid URL: http://google.com/#foo"
+      text: "text http://google.com/#foo"
+      expected: ["http://google.com/#foo"]
+    - description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks"
+      text: "text http://google.com/#search?q=iphone%20-filter%3Alinks"
+      expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"]
+    - description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks"
+      text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks"
+      expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"]
+    - description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
+      text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
+      expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"]
+    - description: "Extract valid URL: http://somehost.com:3000"
+      text: "text http://somehost.com:3000"
+      expected: ["http://somehost.com:3000"]
+    - description: "Extract valid URL: http://x.com/~matthew+%-x"
+      text: "text http://x.com/~matthew+%-x"
+      expected: ["http://x.com/~matthew+%-x"]
+    - description: "Extract valid URL: http://x.com/~matthew+%-,.;x"
+      text: "text http://x.com/~matthew+%-,.;x"
+      expected: ["http://x.com/~matthew+%-,.;x"]
+    - description: "Extract valid URL: http://x.com/,.;x"
+      text: "text http://x.com/,.;x"
+      expected: ["http://x.com/,.;x"]
+    - description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)"
+      text: "text http://en.wikipedia.org/wiki/Primer_(film)"
+      expected: ["http://en.wikipedia.org/wiki/Primer_(film)"]
+    - description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59"
+      text: "text http://www.ams.org/bookstore-getitem/item=mbk-59"
+      expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"]
+    - description: "Extract valid URL: http://✪df.ws/ejp"
+      text: "text http://✪df.ws/ejp"
+      expected: ["http://✪df.ws/ejp"]
+    - description: "Extract valid URL: http://chilp.it/?77e8fd"
+      text: "text http://chilp.it/?77e8fd"
+      expected: ["http://chilp.it/?77e8fd"]
+    - description: "DO NOT extract invalid URL: http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
+      text: "text http://doman-dash_2314352345_dfasd.foo-cow_4352.com"
+      expected: []
+    - description: "DO NOT extract invalid URL: http://no-tld"
+      text: "text http://no-tld"
+      expected: []
+    - description: "DO NOT extract invalid URL: http://tld-too-short.x"
+      text: "text http://tld-too-short.x"
+      expected: []
+  hashtags:
+    - description: "Extract an all-alpha hashtag"
+      text: "a #hashtag here"
+      expected: ["hashtag"]
+    - description: "Extract a letter-then-number hashtag"
+      text: "this is #hashtag1"
+      expected: ["hashtag1"]
+    - description: "Extract a number-then-letter hashtag"
+      text: "#1hashtag is this"
+      expected: ["1hashtag"]
+    - description: "DO NOT Extract an all-numeric hashtag"
+      text: "On the #16 bus"
+      expected: []
+    - description: "Extract a hashtag containing ñ"
+      text: "I'll write more tests #mañana"
+      expected: ["mañana"]
+    - description: "Extract a hashtag containing é"
+      text: "Working remotely #café"
+      expected: ["café"]
+    - description: "Extract a hashtag containing ü"
+      text: "Getting my Oktoberfest on #münchen"
+      expected: ["münchen"]
+    - description: "DO NOT Extract a hashtag containing Japanese"
+      text: "this is not valid: # 会議中 ハッシュ"
+      expected: []

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tweetparser
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Paul Battley
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-02-23 00:00:00 +00:00
+date: 2010-02-24 00:00:00 +00:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -51,7 +51,11 @@ extensions: []
 extra_rdoc_files: []
 files:
+- test/conformance_test.rb
 - test/parser_test.rb
+- test/twitter-text-conformance/autolink.yml
+- test/twitter-text-conformance/extract.yml
+- test/twitter-text-conformance/README
 - lib/tweetparser/grammar.treetop
 - lib/tweetparser.rb
 has_rdoc: true