tweetparser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/tweetparser.rb +3 -0
- data/lib/tweetparser/grammar.treetop +65 -0
- data/test/parser_test.rb +78 -0
- metadata +86 -0
data/lib/tweetparser/grammar.treetop
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
grammar TweetContent
|
|
2
|
+
rule tweet
|
|
3
|
+
(url / html / space / newline / atref / hashtag / text)* {
|
|
4
|
+
def content
|
|
5
|
+
elements.map{ |e| e.content }
|
|
6
|
+
end
|
|
7
|
+
}
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
rule url
|
|
11
|
+
"http" "s"? "://" [\./a-zA-Z0-9\?#=\-_&%]+ {
|
|
12
|
+
def content
|
|
13
|
+
[:url, text_value]
|
|
14
|
+
end
|
|
15
|
+
}
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
rule atref
|
|
19
|
+
"@" [a-zA-Z0-9_]+ {
|
|
20
|
+
def content
|
|
21
|
+
[:atref, text_value]
|
|
22
|
+
end
|
|
23
|
+
}
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
rule hashtag
|
|
27
|
+
"#" [a-zA-Z0-9_]+ {
|
|
28
|
+
def content
|
|
29
|
+
[:hashtag, text_value]
|
|
30
|
+
end
|
|
31
|
+
}
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
rule text
|
|
35
|
+
([^h\s] / "h" !("ttp" "s"? "://"))+ {
|
|
36
|
+
def content
|
|
37
|
+
[:text, text_value]
|
|
38
|
+
end
|
|
39
|
+
}
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
rule html
|
|
43
|
+
"<" [^>]+ ">" {
|
|
44
|
+
def content
|
|
45
|
+
[:html, text_value]
|
|
46
|
+
end
|
|
47
|
+
}
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
rule newline
|
|
51
|
+
"\r"? "\n" {
|
|
52
|
+
def content
|
|
53
|
+
[:newline]
|
|
54
|
+
end
|
|
55
|
+
}
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
rule space
|
|
59
|
+
" "+ {
|
|
60
|
+
def content
|
|
61
|
+
[:space, text_value]
|
|
62
|
+
end
|
|
63
|
+
}
|
|
64
|
+
end
|
|
65
|
+
end
|
data/test/parser_test.rb
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
$:.unshift(File.expand_path("../../lib", __FILE__))
|
|
3
|
+
require "test/unit"
|
|
4
|
+
require "shoulda"
|
|
5
|
+
require "tweetparser"
|
|
6
|
+
|
|
7
|
+
class ParserTest < Test::Unit::TestCase
|
|
8
|
+
|
|
9
|
+
def setup
|
|
10
|
+
@parser = TweetContentParser.new
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def assert_parses(expected, input)
|
|
14
|
+
actual = @parser.parse(input).content
|
|
15
|
+
assert_equal expected, actual
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
should "parse a blank string" do
|
|
19
|
+
assert_parses [], ""
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
should "extract url with query string and target" do
|
|
23
|
+
s = "https://mail.google.com/mail/?ui=2&shva=1#inbox"
|
|
24
|
+
assert_parses [[:url, s]], s
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
should "extract hashtag" do
|
|
28
|
+
s = "#HashTag2010"
|
|
29
|
+
assert_parses [[:hashtag, s]], s
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
should "extract at-references" do
|
|
33
|
+
s = "@AtRef_3000"
|
|
34
|
+
assert_parses [[:atref, s]], s
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
should "extract HTML" do
|
|
38
|
+
s = %{<some tag with="http://href.com/">}
|
|
39
|
+
assert_parses [[:html, s]], s
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
should "extract words spaces and new lines" do
|
|
43
|
+
s = "this string\nhas spaces!"
|
|
44
|
+
expected = [[:text, "this"], [:space, " "], [:text, "string"], [:newline],
|
|
45
|
+
[:text, "has"], [:space, " "], [:text, "spaces!"]]
|
|
46
|
+
assert_parses expected, s
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
should "extract everything from sample tweet" do
|
|
50
|
+
s = %{Another test: <a href="http://twitpic.com/14vzny" target="_blank"><img src="http://twitpic.com/show/mini/14vzny" /></a>\nhttp://twitpic.com/14vzny 3 http://twitpic.com/14vzny}
|
|
51
|
+
expected = [[:text, "Another"], [:space, " "], [:text, "test:"], [:space, " "],
|
|
52
|
+
[:html, "<a href=\"http://twitpic.com/14vzny\" target=\"_blank\">"],
|
|
53
|
+
[:html, "<img src=\"http://twitpic.com/show/mini/14vzny\" />"],
|
|
54
|
+
[:html, "</a>"], [:newline],
|
|
55
|
+
[:url, "http://twitpic.com/14vzny"],
|
|
56
|
+
[:space, " "], [:text, "3"], [:space, " "],
|
|
57
|
+
[:url, "http://twitpic.com/14vzny"]]
|
|
58
|
+
assert_parses expected, s
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
should "extract elements from real-world sample" do
|
|
62
|
+
s = %{RT @newsbrooke Tonight’s the night!: Hope you’ll all tune in tonight to watch On Expenses at 9pm on BBC4 http://bit.ly/cgbkmF #mps #uk}
|
|
63
|
+
expected = [[:text, "RT"], [:space, " "], [:atref, "@newsbrooke"], [:space, " "],
|
|
64
|
+
[:text, "Tonight’s"], [:space, " "], [:text, "the"], [:space, " "],
|
|
65
|
+
[:text, "night!:"], [:space, " "], [:text, "Hope"], [:space, " "],
|
|
66
|
+
[:text, "you’ll"], [:space, " "], [:text, "all"], [:space, " "],
|
|
67
|
+
[:text, "tune"], [:space, " "], [:text, "in"], [:space, " "],
|
|
68
|
+
[:text, "tonight"], [:space, " "], [:text, "to"], [:space, " "],
|
|
69
|
+
[:text, "watch"], [:space, " "], [:text, "On"], [:space, " "],
|
|
70
|
+
[:text, "Expenses"], [:space, " "], [:text, "at"], [:space, " "],
|
|
71
|
+
[:text, "9pm"], [:space, " "], [:text, "on"], [:space, " "],
|
|
72
|
+
[:text, "BBC4"], [:space, " "], [:url, "http://bit.ly/cgbkmF"], [:space, " "],
|
|
73
|
+
[:hashtag, "#mps"], [:space, " "], [:hashtag, "#uk"]]
|
|
74
|
+
assert_parses expected, s
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
end
|
|
78
|
+
|
metadata
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: tweetparser
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Paul Battley
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
|
|
12
|
+
date: 2010-02-23 00:00:00 +00:00
|
|
13
|
+
default_executable:
|
|
14
|
+
dependencies:
|
|
15
|
+
- !ruby/object:Gem::Dependency
|
|
16
|
+
name: treetop
|
|
17
|
+
type: :runtime
|
|
18
|
+
version_requirement:
|
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
20
|
+
requirements:
|
|
21
|
+
- - ~>
|
|
22
|
+
- !ruby/object:Gem::Version
|
|
23
|
+
version: 1.4.2
|
|
24
|
+
version:
|
|
25
|
+
- !ruby/object:Gem::Dependency
|
|
26
|
+
name: polyglot
|
|
27
|
+
type: :runtime
|
|
28
|
+
version_requirement:
|
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ~>
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: 0.2.9
|
|
34
|
+
version:
|
|
35
|
+
- !ruby/object:Gem::Dependency
|
|
36
|
+
name: shoulda
|
|
37
|
+
type: :development
|
|
38
|
+
version_requirement:
|
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
40
|
+
requirements:
|
|
41
|
+
- - ">="
|
|
42
|
+
- !ruby/object:Gem::Version
|
|
43
|
+
version: "0"
|
|
44
|
+
version:
|
|
45
|
+
description:
|
|
46
|
+
email: pbattley@gmail.com
|
|
47
|
+
executables: []
|
|
48
|
+
|
|
49
|
+
extensions: []
|
|
50
|
+
|
|
51
|
+
extra_rdoc_files: []
|
|
52
|
+
|
|
53
|
+
files:
|
|
54
|
+
- test/parser_test.rb
|
|
55
|
+
- lib/tweetparser/grammar.treetop
|
|
56
|
+
- lib/tweetparser.rb
|
|
57
|
+
has_rdoc: true
|
|
58
|
+
homepage: http://github.com/madebymany/tweetparser
|
|
59
|
+
licenses: []
|
|
60
|
+
|
|
61
|
+
post_install_message:
|
|
62
|
+
rdoc_options: []
|
|
63
|
+
|
|
64
|
+
require_paths:
|
|
65
|
+
- lib
|
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
67
|
+
requirements:
|
|
68
|
+
- - ">="
|
|
69
|
+
- !ruby/object:Gem::Version
|
|
70
|
+
version: "0"
|
|
71
|
+
version:
|
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
73
|
+
requirements:
|
|
74
|
+
- - ">="
|
|
75
|
+
- !ruby/object:Gem::Version
|
|
76
|
+
version: "0"
|
|
77
|
+
version:
|
|
78
|
+
requirements: []
|
|
79
|
+
|
|
80
|
+
rubyforge_project:
|
|
81
|
+
rubygems_version: 1.3.5
|
|
82
|
+
signing_key:
|
|
83
|
+
specification_version: 3
|
|
84
|
+
summary: Extract content from tweets
|
|
85
|
+
test_files: []
|
|
86
|
+
|