RubyGems - tweetparser - Versions diffs - 0.1.0 - Mend

tweetparser 0.1.0

Files changed (4) hide show

data/lib/tweetparser.rb +3 -0
data/lib/tweetparser/grammar.treetop +65 -0
data/test/parser_test.rb +78 -0
metadata +86 -0

data/lib/tweetparser.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require "treetop"
+require "polyglot"
+require "tweetparser/grammar"

data/lib/tweetparser/grammar.treetop ADDED Viewed

@@ -0,0 +1,65 @@
+# Tokenises the body of a tweet. Each rule's syntax node answers `content`
+# with a tagged array such as [:url, "http://..."] or [:newline]; the top-level
+# `tweet` node answers with the list of those arrays in input order.
+grammar TweetContent
+  # Entry rule: zero or more tokens. The alternatives are tried in the order
+  # written, so `text` only matches where nothing more specific does.
+  rule tweet
+    (url / html / space / newline / atref / hashtag / text)* {
+      def content
+        elements.map { |e| e.content }
+      end
+    }
+  end
+  # "http://" or "https://" followed by a run of characters from a fixed set.
+  # A character outside that set (for example ":" or "~") ends the URL token.
+  rule url
+    "http" "s"? "://" [\./a-zA-Z0-9\?#=\-_&%]+ {
+      def content
+        [:url, text_value]
+      end
+    }
+  end
+  # "@" followed by one or more letters, digits or underscores.
+  rule atref
+    "@" [a-zA-Z0-9_]+ {
+      def content
+        [:atref, text_value]
+      end
+    }
+  end
+  # "#" followed by one or more letters, digits or underscores.
+  rule hashtag
+    "#" [a-zA-Z0-9_]+ {
+      def content
+        [:hashtag, text_value]
+      end
+    }
+  end
+  # A run of non-whitespace characters. An "h" is consumed only when it does
+  # not start "http://" or "https://", so a URL glued to the end of a word
+  # terminates the text token and is picked up by `url` on the next pass.
+  rule text
+    ([^h\s] / "h" !("ttp" "s"? "://"))+ {
+      def content
+        [:text, text_value]
+      end
+    }
+  end
+  # Everything from "<" up to and including the next ">". The markup itself
+  # is not inspected.
+  rule html
+    "<" [^>]+ ">" {
+      def content
+        [:html, text_value]
+      end
+    }
+  end
+  # A line feed, optionally preceded by a carriage return. The matched
+  # characters are not included in the token.
+  rule newline
+    "\r"? "\n" {
+      def content
+        [:newline]
+      end
+    }
+  end
+  # A run of spaces and/or tabs. Tabs are accepted here because `text`
+  # rejects every whitespace character; if no rule matched a tab, the
+  # whole parse would fail on any input containing one.
+  rule space
+    [ \t]+ {
+      def content
+        [:space, text_value]
+      end
+    }
+  end
+end

data/test/parser_test.rb ADDED Viewed

@@ -0,0 +1,78 @@
+# encoding: UTF-8
+$:.unshift(File.expand_path("../../lib", __FILE__))
+require "test/unit"
+require "shoulda"
+require "tweetparser"
+class ParserTest < Test::Unit::TestCase
+  def setup
+    @parser = TweetContentParser.new
+  end
+  def assert_parses(expected, input)
+    actual = @parser.parse(input).content
+    assert_equal expected, actual
+  end
+  should "parse a blank string" do
+    assert_parses [], ""
+  end
+  should "extract url with query string and target" do
+    s = "https://mail.google.com/mail/?ui=2&shva=1#inbox"
+    assert_parses [[:url, s]], s
+  end
+  should "extract hashtag" do
+    s = "#HashTag2010"
+    assert_parses [[:hashtag, s]], s
+  end
+  should "extract at-references" do
+    s = "@AtRef_3000"
+    assert_parses [[:atref, s]], s
+  end
+  should "extract HTML" do
+    s = %{<some tag with="http://href.com/">}
+    assert_parses [[:html, s]], s
+  end
+  should "extract words spaces and new lines" do
+    s = "this string\nhas spaces!"
+    expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
+                [:text, "has"], [:space, " "], [:text, "spaces!"]]
+    assert_parses expected, s
+  end
+  should "extract everything from sample tweet" do
+    s = %{Another test:  <a href="http://twitpic.com/14vzny" target="_blank"><img src="http://twitpic.com/show/mini/14vzny" /></a>\nhttp://twitpic.com/14vzny 3 http://twitpic.com/14vzny}
+    expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, "  "],
+                [:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
+                [:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
+                [:html, "</a>"], [:newline],
+                [:url, "http://twitpic.com/14vzny"],
+                [:space, " "], [:text, "3"], [:space, " "],
+                [:url, "http://twitpic.com/14vzny"]]
+    assert_parses expected, s
+  end
+  should "extract elements from real-world sample" do
+    s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
+    expected = [[:text, "RT"], [:space, " "], [:atref, "@newsbrooke"], [:space, " "],
+                [:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
+                [:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
+                [:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],
+                [:text, "tune"], [:space, " "], [:text, "in"], [:space, " "],
+                [:text, "tonight"], [:space, " "], [:text, "to"], [:space, " "],
+                [:text, "watch"], [:space, " "], [:text, "On"], [:space, " "],
+                [:text, "Expenses"], [:space, " "], [:text, "at"], [:space, " "],
+                [:text, "9pm"], [:space, " "], [:text, "on"], [:space, " "],
+                [:text, "BBC4"], [:space, " "], [:url, "http://bit.ly/cgbkmF"], [:space, " "],
+                [:hashtag, "#mps"], [:space, " "], [:hashtag, "#uk"]]
+    assert_parses expected, s
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,86 @@
+--- !ruby/object:Gem::Specification
+name: tweetparser
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Paul Battley
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-02-23 00:00:00 +00:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: treetop
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.4.2
+    version:
+- !ruby/object:Gem::Dependency
+  name: polyglot
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.2.9
+    version:
+- !ruby/object:Gem::Dependency
+  name: shoulda
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+description:
+email: pbattley@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- test/parser_test.rb
+- lib/tweetparser/grammar.treetop
+- lib/tweetparser.rb
+has_rdoc: true
+homepage: http://github.com/madebymany/tweetparser
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Extract content from tweets
+test_files: []