tweetparser 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ require "treetop"
2
+ require "polyglot"
3
+ require "tweetparser/grammar"
@@ -0,0 +1,65 @@
1
# Tokenises the body of a tweet into a flat list of tagged elements.
#
# The node returned for a successful parse responds to #content with an
# array of [type, value] pairs (line breaks are a bare [:newline]),
# in the order the tokens appear in the input.
grammar TweetContent
  # Entry point: zero or more tokens. Alternatives are tried in the order
  # listed, so `text` only sees input that no earlier rule matched.
  rule tweet
    (url / html / space / newline / atref / hashtag / text)* {
      # Content of every matched token, in input order.
      def content
        elements.map { |e| e.content }
      end
    }
  end

  # An http:// or https:// URL. The match ends at the first character that
  # is not in the class below; whatever follows is tokenised by the other
  # rules.
  # NOTE(review): the class omits characters such as ':' and '~', so a URL
  # with an explicit port or a tilde path is split there -- confirm whether
  # that is intended.
  rule url
    "http" "s"? "://" [\./a-zA-Z0-9\?#=\-_&%]+ {
      def content
        [:url, text_value]
      end
    }
  end

  # An @-reference: "@" followed by one or more letters, digits or "_".
  rule atref
    "@" [a-zA-Z0-9_]+ {
      def content
        [:atref, text_value]
      end
    }
  end

  # A hashtag: "#" followed by one or more letters, digits or "_".
  rule hashtag
    "#" [a-zA-Z0-9_]+ {
      def content
        [:hashtag, text_value]
      end
    }
  end

  # A run of non-whitespace characters. An "h" is only consumed when it does
  # not start "http://" or "https://", so a URL glued onto the end of a word
  # is still handed to the `url` rule.
  rule text
    ([^h\s] / "h" !("ttp" "s"? "://"))+ {
      def content
        [:text, text_value]
      end
    }
  end

  # Anything between "<" and the next ">", kept verbatim (at least one
  # character is required between the brackets).
  rule html
    "<" [^>]+ ">" {
      def content
        [:html, text_value]
      end
    }
  end

  # A line break: "\r\n", a bare "\n", or a bare "\r". Both terminals are
  # optional, so the lookahead is what guarantees that at least one
  # character is consumed. A bare "\r" used to match no rule at all, which
  # made the whole parse fail.
  rule newline
    &[\r\n] "\r"? "\n"? {
      def content
        [:newline]
      end
    }
  end

  # A run of horizontal whitespace: every character matched by \s except
  # CR and LF (space, tab, form feed, ...). `text` refuses all of \s, so
  # this rule plus `newline` must cover all of it; previously only " " was
  # accepted and a single tab made the whole parse fail.
  rule space
    [^\S\r\n]+ {
      def content
        [:space, text_value]
      end
    }
  end
end
@@ -0,0 +1,78 @@
1
+ # encoding: UTF-8
2
+ $:.unshift(File.expand_path("../../lib", __FILE__))
3
+ require "test/unit"
4
+ require "shoulda"
5
+ require "tweetparser"
6
+
7
# Exercises TweetContentParser against hand-written snippets and real tweets.
class ParserTest < Test::Unit::TestCase

  def setup
    @parser = TweetContentParser.new
  end

  # Parses +input+ and asserts that its token list equals +expected+.
  #
  # Treetop returns nil when the input cannot be parsed. Check for that
  # first so a failure reports the parser's own reason instead of a
  # NoMethodError raised by calling #content on nil.
  def assert_parses(expected, input)
    tree = @parser.parse(input)
    assert_not_nil tree, "failed to parse #{input.inspect}: #{@parser.failure_reason}"
    assert_equal expected, tree.content
  end

  should "parse a blank string" do
    assert_parses [], ""
  end

  should "extract url with query string and target" do
    s = "https://mail.google.com/mail/?ui=2&shva=1#inbox"
    assert_parses [[:url, s]], s
  end

  should "extract hashtag" do
    s = "#HashTag2010"
    assert_parses [[:hashtag, s]], s
  end

  should "extract at-references" do
    s = "@AtRef_3000"
    assert_parses [[:atref, s]], s
  end

  # The URL inside the attribute must stay part of the :html token.
  should "extract HTML" do
    s = %{<some tag with="http://href.com/">}
    assert_parses [[:html, s]], s
  end

  should "extract words spaces and new lines" do
    s = "this string\nhas spaces!"
    expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
                [:text, "has"], [:space, " "], [:text, "spaces!"]]
    assert_parses expected, s
  end

  should "extract everything from sample tweet" do
    s = %{Another test: <a href="http://twitpic.com/14vzny" target="_blank"><img src="http://twitpic.com/show/mini/14vzny" /></a>\nhttp://twitpic.com/14vzny 3 http://twitpic.com/14vzny}
    expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, " "],
                [:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
                [:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
                [:html, "</a>"], [:newline],
                [:url, "http://twitpic.com/14vzny"],
                [:space, " "], [:text, "3"], [:space, " "],
                [:url, "http://twitpic.com/14vzny"]]
    assert_parses expected, s
  end

  # Contains non-ASCII apostrophes, which must survive inside :text tokens.
  should "extract elements from real-world sample" do
    s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
    expected = [[:text, "RT"], [:space, " "], [:atref, "@newsbrooke"], [:space, " "],
                [:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
                [:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
                [:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],
                [:text, "tune"], [:space, " "], [:text, "in"], [:space, " "],
                [:text, "tonight"], [:space, " "], [:text, "to"], [:space, " "],
                [:text, "watch"], [:space, " "], [:text, "On"], [:space, " "],
                [:text, "Expenses"], [:space, " "], [:text, "at"], [:space, " "],
                [:text, "9pm"], [:space, " "], [:text, "on"], [:space, " "],
                [:text, "BBC4"], [:space, " "], [:url, "http://bit.ly/cgbkmF"], [:space, " "],
                [:hashtag, "#mps"], [:space, " "], [:hashtag, "#uk"]]
    assert_parses expected, s
  end

end
78
+
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tweetparser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Paul Battley
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-23 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: treetop
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 1.4.2
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: polyglot
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.2.9
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: shoulda
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description:
46
+ email: pbattley@gmail.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files: []
52
+
53
+ files:
54
+ - test/parser_test.rb
55
+ - lib/tweetparser/grammar.treetop
56
+ - lib/tweetparser.rb
57
+ has_rdoc: true
58
+ homepage: http://github.com/madebymany/tweetparser
59
+ licenses: []
60
+
61
+ post_install_message:
62
+ rdoc_options: []
63
+
64
+ require_paths:
65
+ - lib
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: "0"
71
+ version:
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: "0"
77
+ version:
78
+ requirements: []
79
+
80
+ rubyforge_project:
81
+ rubygems_version: 1.3.5
82
+ signing_key:
83
+ specification_version: 3
84
+ summary: Extract content from tweets
85
+ test_files: []
86
+