tweetparser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ require "treetop"
2
+ require "polyglot"
3
+ require "tweetparser/grammar"
@@ -0,0 +1,65 @@
1
+ # Treetop grammar that splits the text of a tweet into typed tokens.
2
+ #
3
+ # Every rule gives its node a +content+ method. On the top-level +tweet+ node
4
+ # it returns an array with one entry per token; on every other node it returns
5
+ # a single token, [type, matched_text] (just [:newline] for line breaks).
6
+ grammar TweetContent
7
+   # Entry point: any number of tokens, including none. Alternatives are tried
8
+   # in the order listed, so +text+ is only used when nothing else matches.
9
+   rule tweet
10
+     (url / html / space / newline / atref / hashtag / text)* {
11
+       def content
12
+         elements.map{ |e| e.content }
13
+       end
14
+     }
15
+   end
16
+
17
+   # An http:// or https:// URL: the scheme followed by one or more ASCII
18
+   # letters, digits or any of . / ? # = - _ & %
19
+   rule url
20
+     "http" "s"? "://" [\./a-zA-Z0-9\?#=\-_&%]+ {
21
+       def content
22
+         [:url, text_value]
23
+       end
24
+     }
25
+   end
26
+
27
+   # "@" followed by one or more ASCII letters, digits or underscores.
28
+   rule atref
29
+     "@" [a-zA-Z0-9_]+ {
30
+       def content
31
+         [:atref, text_value]
32
+       end
33
+     }
34
+   end
35
+
36
+   # "#" followed by one or more ASCII letters, digits or underscores.
37
+   rule hashtag
38
+     "#" [a-zA-Z0-9_]+ {
39
+       def content
40
+         [:hashtag, text_value]
41
+       end
42
+     }
43
+   end
44
+
45
+   # A run of non-whitespace characters that ends just before anything +url+
46
+   # would match. An "h" is refused only when a complete URL starts at it, so a
47
+   # bare "http://" with no URL characters after it is kept as text instead of
48
+   # matching no rule at all.
49
+   rule text
50
+     ([^h\s] / !url "h")+ {
51
+       def content
52
+         [:text, text_value]
53
+       end
54
+     }
55
+   end
56
+
57
+   # "<", one or more characters other than ">", then ">" (e.g. an HTML tag).
58
+   rule html
59
+     "<" [^>]+ ">" {
60
+       def content
61
+         [:html, text_value]
62
+       end
63
+     }
64
+   end
65
+
66
+   # A line feed, optionally preceded by a carriage return. The token carries
67
+   # no text.
68
+   rule newline
69
+     "\r"? "\n" {
70
+       def content
71
+         [:newline]
72
+       end
73
+     }
74
+   end
75
+
76
+   # A run of spaces and/or tabs. Tabs are accepted here because +text+ excludes
77
+   # all whitespace, so a tab would otherwise match no rule.
78
+   rule space
79
+     [ \t]+ {
80
+       def content
81
+         [:space, text_value]
82
+       end
83
+     }
84
+   end
85
+ end
@@ -0,0 +1,78 @@
1
+ # encoding: UTF-8
2
+ $:.unshift(File.expand_path("../../lib", __FILE__))
3
+ require "test/unit"
4
+ require "shoulda"
5
+ require "tweetparser"
6
+
7
+ class ParserTest < Test::Unit::TestCase
8
+
9
+ def setup
10
+ @parser = TweetContentParser.new
11
+ end
12
+
13
+ def assert_parses(expected, input)
14
+ actual = @parser.parse(input).content
15
+ assert_equal expected, actual
16
+ end
17
+
18
+ should "parse a blank string" do
19
+ assert_parses [], ""
20
+ end
21
+
22
+ should "extract url with query string and target" do
23
+ s = "https://mail.google.com/mail/?ui=2&shva=1#inbox"
24
+ assert_parses [[:url, s]], s
25
+ end
26
+
27
+ should "extract hashtag" do
28
+ s = "#HashTag2010"
29
+ assert_parses [[:hashtag, s]], s
30
+ end
31
+
32
+ should "extract at-references" do
33
+ s = "@AtRef_3000"
34
+ assert_parses [[:atref, s]], s
35
+ end
36
+
37
+ should "extract HTML" do
38
+ s = %{<some tag with="http://href.com/">}
39
+ assert_parses [[:html, s]], s
40
+ end
41
+
42
+ should "extract words spaces and new lines" do
43
+ s = "this string\nhas spaces!"
44
+ expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
45
+ [:text, "has"], [:space, " "], [:text, "spaces!"]]
46
+ assert_parses expected, s
47
+ end
48
+
49
+ should "extract everything from sample tweet" do
50
+ s = %{Another test: <a href="http://twitpic.com/14vzny" target="_blank"><img src="http://twitpic.com/show/mini/14vzny" /></a>\nhttp://twitpic.com/14vzny 3 http://twitpic.com/14vzny}
51
+ expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, " "],
52
+ [:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
53
+ [:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
54
+ [:html, "</a>"], [:newline],
55
+ [:url, "http://twitpic.com/14vzny"],
56
+ [:space, " "], [:text, "3"], [:space, " "],
57
+ [:url, "http://twitpic.com/14vzny"]]
58
+ assert_parses expected, s
59
+ end
60
+
61
+ should "extract elements from real-world sample" do
62
+ s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
63
+ expected = [[:text, "RT"], [:space, " "], [:atref, "@newsbrooke"], [:space, " "],
64
+ [:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
65
+ [:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
66
+ [:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],
67
+ [:text, "tune"], [:space, " "], [:text, "in"], [:space, " "],
68
+ [:text, "tonight"], [:space, " "], [:text, "to"], [:space, " "],
69
+ [:text, "watch"], [:space, " "], [:text, "On"], [:space, " "],
70
+ [:text, "Expenses"], [:space, " "], [:text, "at"], [:space, " "],
71
+ [:text, "9pm"], [:space, " "], [:text, "on"], [:space, " "],
72
+ [:text, "BBC4"], [:space, " "], [:url, "http://bit.ly/cgbkmF"], [:space, " "],
73
+ [:hashtag, "#mps"], [:space, " "], [:hashtag, "#uk"]]
74
+ assert_parses expected, s
75
+ end
76
+
77
+ end
78
+
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tweetparser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Paul Battley
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-23 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: treetop
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 1.4.2
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: polyglot
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.2.9
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: shoulda
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description:
46
+ email: pbattley@gmail.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files: []
52
+
53
+ files:
54
+ - test/parser_test.rb
55
+ - lib/tweetparser/grammar.treetop
56
+ - lib/tweetparser.rb
57
+ has_rdoc: true
58
+ homepage: http://github.com/madebymany/tweetparser
59
+ licenses: []
60
+
61
+ post_install_message:
62
+ rdoc_options: []
63
+
64
+ require_paths:
65
+ - lib
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: "0"
71
+ version:
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: "0"
77
+ version:
78
+ requirements: []
79
+
80
+ rubyforge_project:
81
+ rubygems_version: 1.3.5
82
+ signing_key:
83
+ specification_version: 3
84
+ summary: Extract content from tweets
85
+ test_files: []
86
+