tweetparser 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/tweetparser.rb +3 -0
- data/lib/tweetparser/grammar.treetop +65 -0
- data/test/parser_test.rb +78 -0
- metadata +86 -0
data/lib/tweetparser/grammar.treetop
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
# Treetop grammar that splits the body of a tweet into a flat list of tokens.
#
# Every rule gives its syntax node a +content+ method. For the top-level
# +tweet+ rule it returns an array of tokens; for every other rule it returns
# a single token, an array whose first element is the token type:
#   [:url, s], [:html, s], [:space, s], [:newline], [:atref, s],
#   [:hashtag, s] or [:text, s]
grammar TweetContent
  # Entry point: zero or more tokens. Alternatives are tried in the order
  # listed, so +text+ is the fallback once nothing more specific matches.
  rule tweet
    (url / html / space / newline / atref / hashtag / text)* {
      def content
        elements.map { |e| e.content }
      end
    }
  end

  # An http:// or https:// URL.
  # NOTE(review): the character class has no ':' or '~', so a URL carrying a
  # port number or a tilde ends early and the remainder is tokenized by the
  # other rules. Confirm whether that is intended before widening the class.
  rule url
    "http" "s"? "://" [\./a-zA-Z0-9\?#=\-_&%]+ {
      def content
        [:url, text_value]
      end
    }
  end

  # An at-reference: "@" followed by letters, digits or underscores.
  rule atref
    "@" [a-zA-Z0-9_]+ {
      def content
        [:atref, text_value]
      end
    }
  end

  # A hashtag: "#" followed by letters, digits or underscores.
  rule hashtag
    "#" [a-zA-Z0-9_]+ {
      def content
        [:hashtag, text_value]
      end
    }
  end

  # A run of non-whitespace characters that stops before the start of an
  # http(s) URL. The "h" alternative uses a negative lookahead so that words
  # containing "h" are still accepted. Once a text token has started, any
  # "@", "#" or "<" inside the run stays part of the same token.
  rule text
    ([^h\s] / "h" !("ttp" "s"? "://"))+ {
      def content
        [:text, text_value]
      end
    }
  end

  # A single HTML tag: "<", at least one character that is not ">", then ">".
  rule html
    "<" [^>]+ ">" {
      def content
        [:html, text_value]
      end
    }
  end

  # A line break. "\r\n" is tried first so that it yields one token. A lone
  # "\r" is accepted as well: +text+ rejects it (it is in \s), so without this
  # alternative a single carriage return made the whole parse fail.
  rule newline
    ("\r\n" / "\n" / "\r") {
      def content
        [:newline]
      end
    }
  end

  # A run of horizontal whitespace. Tab, form feed and vertical tab are
  # included alongside the space character because +text+ rejects every \s
  # character; when only " " was accepted here, a tab anywhere in the input
  # matched no rule and the parser returned nil.
  rule space
    [ \t\f\v]+ {
      def content
        [:space, text_value]
      end
    }
  end
end
|
data/test/parser_test.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
$:.unshift(File.expand_path("../../lib", __FILE__))
|
3
|
+
require "test/unit"
|
4
|
+
require "shoulda"
|
5
|
+
require "tweetparser"
|
6
|
+
|
7
|
+
# Exercises the TweetContent grammar through the generated
# TweetContentParser, comparing the token list returned by +content+ with the
# expected tokens for a range of inputs.
class ParserTest < Test::Unit::TestCase

  def setup
    @parser = TweetContentParser.new
  end

  # Parses +input+ and asserts that the resulting token list equals
  # +expected+.
  #
  # The parser returns nil when it cannot parse the input, so the tree is
  # checked first and the parser's own failure reason is reported. Calling
  # +content+ on nil would otherwise raise a NoMethodError that says nothing
  # about why the parse failed.
  def assert_parses(expected, input)
    tree = @parser.parse(input)
    assert_not_nil tree, "could not parse #{input.inspect}: #{@parser.failure_reason}"
    assert_equal expected, tree.content
  end

  should "parse a blank string" do
    assert_parses [], ""
  end

  should "extract url with query string and target" do
    s = "https://mail.google.com/mail/?ui=2&shva=1#inbox"
    assert_parses [[:url, s]], s
  end

  should "extract hashtag" do
    s = "#HashTag2010"
    assert_parses [[:hashtag, s]], s
  end

  should "extract at-references" do
    s = "@AtRef_3000"
    assert_parses [[:atref, s]], s
  end

  should "extract HTML" do
    s = %{<some tag with="http://href.com/">}
    assert_parses [[:html, s]], s
  end

  should "extract words spaces and new lines" do
    s = "this string\nhas spaces!"
    expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
                [:text, "has"], [:space, " "], [:text, "spaces!"]]
    assert_parses expected, s
  end

  should "extract everything from sample tweet" do
    s = %{Another test: <a href="http://twitpic.com/14vzny" target="_blank"><img src="http://twitpic.com/show/mini/14vzny" /></a>\nhttp://twitpic.com/14vzny 3 http://twitpic.com/14vzny}
    expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, " "],
                [:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
                [:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
                [:html, "</a>"], [:newline],
                [:url, "http://twitpic.com/14vzny"],
                [:space, " "], [:text, "3"], [:space, " "],
                [:url, "http://twitpic.com/14vzny"]]
    assert_parses expected, s
  end

  should "extract elements from real-world sample" do
    s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
    expected = [[:text, "RT"], [:space, " "], [:atref, "@newsbrooke"], [:space, " "],
                [:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
                [:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
                [:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],
                [:text, "tune"], [:space, " "], [:text, "in"], [:space, " "],
                [:text, "tonight"], [:space, " "], [:text, "to"], [:space, " "],
                [:text, "watch"], [:space, " "], [:text, "On"], [:space, " "],
                [:text, "Expenses"], [:space, " "], [:text, "at"], [:space, " "],
                [:text, "9pm"], [:space, " "], [:text, "on"], [:space, " "],
                [:text, "BBC4"], [:space, " "], [:url, "http://bit.ly/cgbkmF"], [:space, " "],
                [:hashtag, "#mps"], [:space, " "], [:hashtag, "#uk"]]
    assert_parses expected, s
  end

end
|
78
|
+
|
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tweetparser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Paul Battley
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-23 00:00:00 +00:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: treetop
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.4.2
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: polyglot
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.2.9
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: shoulda
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
description:
|
46
|
+
email: pbattley@gmail.com
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
extra_rdoc_files: []
|
52
|
+
|
53
|
+
files:
|
54
|
+
- test/parser_test.rb
|
55
|
+
- lib/tweetparser/grammar.treetop
|
56
|
+
- lib/tweetparser.rb
|
57
|
+
has_rdoc: true
|
58
|
+
homepage: http://github.com/madebymany/tweetparser
|
59
|
+
licenses: []
|
60
|
+
|
61
|
+
post_install_message:
|
62
|
+
rdoc_options: []
|
63
|
+
|
64
|
+
require_paths:
|
65
|
+
- lib
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: "0"
|
71
|
+
version:
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: "0"
|
77
|
+
version:
|
78
|
+
requirements: []
|
79
|
+
|
80
|
+
rubyforge_project:
|
81
|
+
rubygems_version: 1.3.5
|
82
|
+
signing_key:
|
83
|
+
specification_version: 3
|
84
|
+
summary: Extract content from tweets
|
85
|
+
test_files: []
|
86
|
+
|