ark_tweet_nlp 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e1c7368edd30beda62c7081156d55cb42d459877
4
+ data.tar.gz: e422714ed6d24d3f74a7b12f0b13cc02556db649
5
+ SHA512:
6
+ metadata.gz: 355ae4a472b1bba1cc62fdba4c9017a2be8303e71fafceb4839c0a247bcf039e29603e1d1dbb762987349749c6b82c45dbb541c010e2e51b8277aa418c2525ec
7
+ data.tar.gz: 5c1ef397bc74910781a0c465dda5c0fe9baf1f6831aa9f21ef1affe9e2e8183841c719204cb1ff017b1cadd838410f59db54b078df526005465ecaf26a8c0828
data/.gitignore ADDED
@@ -0,0 +1,47 @@
1
+ ### OSX ###
2
+ .DS_Store
3
+ .AppleDouble
4
+ .LSOverride
5
+
6
+ # Icon must end with two \r
7
+ Icon
8
+
9
+
10
+ # Thumbnails
11
+ ._*
12
+
13
+ # Files that might appear on external disk
14
+ .Spotlight-V100
15
+ .Trashes
16
+
17
+ # Directories potentially created on remote AFP share
18
+ .AppleDB
19
+ .AppleDesktop
20
+ Network Trash Folder
21
+ Temporary Items
22
+ .apdisk
23
+
24
+ /.bundle/
25
+ /.yardoc
26
+ /Gemfile.lock
27
+ /_yardoc/
28
+ /coverage/
29
+ /doc/
30
+ /pkg/
31
+ /spec/reports/
32
+ /tmp/
33
+ *.bundle
34
+ *.so
35
+ *.o
36
+ *.a
37
+ mkmf.log
38
+ *.gem
39
+
40
+
41
+ ### vim ###
42
+ [._]*.s[a-w][a-z]
43
+ [._]s[a-w][a-z]
44
+ *.un~
45
+ Session.vim
46
+ .netrwhist
47
+ *~
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in ark_tweet_nlp.gemspec
4
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,5 @@
1
# Re-runs RSpec whenever a spec or a library file changes.
guard :rspec, cmd: 'rspec' do
  # A changed spec file re-runs itself.
  watch(%r{^spec/.+_spec\.rb$})
  # Specs live directly under spec/ (e.g. spec/parser_spec.rb), not under
  # spec/lib/, so lib/ark_tweet_nlp/parser.rb maps to spec/parser_spec.rb.
  watch(%r{^lib/(?:ark_tweet_nlp/)?(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
  # Touching the shared helper re-runs the whole suite.
  watch('spec/spec_helper.rb') { "spec" }
end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Bernardo
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,53 @@
1
+ # ArkTweetNlp
2
+
3
+ Ruby wrapper for the [Carnegie Mellon Twitter NLP and Part-of-Speech Tagging](http://www.ark.cs.cmu.edu/TweetNLP/) tools.
4
+ Not all features are implemented yet; check the examples below to see how to use it.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'ark_tweet_nlp'
12
+ ```
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install ark_tweet_nlp
21
+
22
+ ## Usage
23
+
24
+ See the list of supported tags:
25
+ ```ruby
26
+ ArkTweetNlp::Parser::TAGSET
27
+ ```
28
+
29
+ Tag a tweet text:
30
+ ```ruby
31
+ ArkTweetNlp::Parser.find_tags('faceboooooooook is awesome')
32
+ #=> [ {'faceboooooooook' => :^,'is' => :V,'awesome' => :A }]
33
+ ```
34
+
35
+ Or multiple tweets separated by \n:
36
+ ```ruby
37
+ ArkTweetNlp::Parser.find_tags("faceboooooooook is awesome\nfaceboooooooook is awesome")
38
+ #=> [{'faceboooooooook' => :^,'is' => :V,'awesome' => :A},{'faceboooooooook' => :^,'is' => :V,'awesome' => :A} ]
39
+ ```
40
+
41
+ Get all words tagged as a specific tag:
42
+ ```ruby
43
+ tagged_result = [{'faceboooooooook' => :^,'is' => :V,'awesome' => :A}]
44
+ ArkTweetNlp::Parser.get_words_tagged_as(tagged_result, :A,:V,:^)
45
+ #=> {:^ => #<Set: {"faceboooooooook"}>, :V => #<Set: {"is"}>, :A => #<Set: {"awesome"}>}
46
+ ```
47
+ ## Contributing
48
+
49
+ 1. Fork it ( https://github.com/golfadas/ark_tweet_nlp_ruby/fork )
50
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
51
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
52
+ 4. Push to the branch (`git push origin my-new-feature`)
53
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
require "rspec/core/rake_task"
require "bundler/gem_tasks"

# Defines `rake spec`. The 'documentation' formatter is used instead of
# 'nested': RSpec 3 dropped the 'nested' name, and the gemspec does not pin
# an RSpec version.
RSpec::Core::RakeTask.new(:spec) do |task|
  task.rspec_opts = ['--color', '--format', 'documentation']
end

# Running plain `rake` runs the specs.
task default: :spec
@@ -0,0 +1,32 @@
1
# coding: utf-8
# Make lib/ loadable so the version constant can be read while building.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'ark_tweet_nlp/version'

Gem::Specification.new do |spec|
  spec.name          = "ark_tweet_nlp"
  spec.version       = ArkTweetNlp::VERSION
  spec.authors       = ["Bernardo"]
  spec.email         = ["bersimoes@gmail.com"]
  spec.summary       = %q{Ruby wrapper for the Carnegie Mellon Twitter NLP}
  spec.description   = %q{Tags tweets word into multiple cathegories using NLP}
  spec.homepage      = "https://github.com/golfadas/ark_tweet_nlp_ruby"
  spec.license       = "MIT"

  # Package every file tracked by git (this includes bin/runTagger.sh and the
  # tagger jar, which the library invokes directly).
  spec.files         = `git ls-files -z`.split("\x0")
  # bin/ only holds a jar and a bash script. RubyGems executables must be Ruby
  # scripts (a Ruby wrapper is generated for each), so none are declared; the
  # files are still shipped through spec.files.
  spec.executables   = []
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "rspec-nc"
  spec.add_development_dependency "pry"
  spec.add_development_dependency 'pry-nav'
  spec.add_development_dependency 'pry-rescue'
  spec.add_development_dependency 'pry-stack_explorer'
  spec.add_development_dependency 'pry-doc'
  spec.add_development_dependency 'guard'
  spec.add_development_dependency 'guard-rspec'
end
Binary file
data/bin/runTagger.sh ADDED
@@ -0,0 +1,5 @@
1
#!/bin/bash
set -eu

# Run the tagger (and tokenizer).
# The jar is looked up next to this script; the path is quoted so that an
# install directory containing spaces still works. All arguments are passed
# through to the tagger unchanged.
java -XX:ParallelGCThreads=2 -Xmx500m -jar "$(dirname "$0")/ark-tweet-nlp-0.3.2.jar" "$@"
@@ -0,0 +1,72 @@
1
require 'open3'
require 'set'

module ArkTweetNlp
  # Ruby front end for the bundled tweet tagger. Text is piped through
  # bin/runTagger.sh and the tagger's tab-separated output is converted into
  # hashes that map each token to its tag symbol.
  module Parser
    # Tag symbols paired with a human-readable description of each tag.
    TAGSET = {
      :N => 'common noun',
      :O => 'pronoun, non possessive',
      :^ => 'proper noun',
      :S => 'nominal + possessive',
      :Z => 'proper noun + possessive',
      :V => 'verb including copula, auxiliaries',
      :L => 'nominal + verbal (e.g. i’m), verbal + nominal (let’s)',
      :M => 'proper noun + verbal',
      :A => 'adjective',
      :R => 'adverb',
      :! => 'interjection',
      :D => 'determiner',
      :P => 'pre- or postposition, or subordinating conjunction',
      :& => 'coordinating conjunction',
      :T => 'verb particle',
      :X => 'existential there, predeterminers',
      :Y => 'X + verbal',
      :'#' => 'hashtag (indicates topic/category for tweet)',
      :'@' => 'at-mention (indicates a user as a recipient of a tweet)',
      :~ => 'discourse marker, indications of continuation across multiple tweets',
      :U => 'URL or email address',
      :E => 'emoticon',
      :'$' => 'numeral',
      :',' => 'punctuation',
      :G => 'other abbreviations, foreign words, possessive endings, symbols, garbage'
    }.freeze

    # Absolute path of the tagger launcher. It is resolved relative to this
    # file (lib/ark_tweet_nlp/parser.rb) rather than the process working
    # directory, so the wrapper works no matter where the caller runs from.
    TAGGER_PATH = File.expand_path('../../../bin/runTagger.sh', __FILE__).freeze

    # Returns the fixed string "ola". Kept only so existing callers keep working.
    def self.ola
      "ola"
    end

    # Tags every tweet contained in +text+ (one tweet per line).
    #
    # @param text [String] one or more tweets separated by "\n"
    # @return [Array<Hash{String => Symbol}>] one hash per line of tagger
    #   output, mapping each token to its tag
    def self.find_tags(text)
      run_tagger(text).split("\n").map { |line| convert_line(line) }
    end

    # Collects, across all tagged tweets, the words carrying any of +tags+.
    #
    # @param tagged_result [Array<Hash{String => Symbol}>] output of find_tags
    # @param tags [Array<Symbol>] tags to keep
    # @return [Hash{Symbol => Set<String>}] requested tags that occur, each
    #   mapped to the set of words tagged with it in any of the tweets
    def self.get_words_tagged_as(tagged_result, *tags)
      selected = tagged_result.map do |tagged_tweet|
        safe_invert(tagged_tweet).select { |tag, _words| tags.include?(tag) }
      end
      merge_array(selected)
    end

    # The helpers below are internal. They stay publicly callable because a
    # bare `private` does not apply to methods defined on the module itself,
    # and hiding them now could break existing callers.

    # Adds every word set of +hash2+ into +hash1+ (set union per tag).
    #
    # @return [Hash] +hash1+, so the method can be used as a fold step
    def self.merge(hash1, hash2)
      hash2.each { |tag, words| (hash1[tag] ||= Set.new).merge(words) }
      hash1
    end

    # Merges all hashes inside +arr+ into a single new hash.
    def self.merge_array(arr)
      arr.each_with_object({}) { |hash, merged| merge(merged, hash) }
    end

    # Feeds +text+ to the tagger on standard input and returns its standard
    # output. The text is never interpolated into a shell command, so quotes
    # and shell metacharacters in tweets are passed through literally.
    #
    # Raises a SystemCallError (e.g. Errno::ENOENT) if the launcher script
    # cannot be executed.
    def self.run_tagger(text)
      # The [path, argv0] form makes the script run directly, without a shell.
      output, _status = Open3.capture2([TAGGER_PATH, TAGGER_PATH], stdin_data: "#{text}\n")
      output
    end

    # Converts one line of tagger output into a token => tag hash. The first
    # tab-separated field holds the tokens and the second holds the tags, both
    # separated by whitespace. A token that occurs more than once keeps only
    # its last tag, because tokens are used as hash keys.
    #
    # @return [Hash{String => Symbol}] empty for a line without tokens
    # @raise [ArgumentError] if the line has fewer tags than tokens
    def self.convert_line(line)
      token_field, tag_field = line.split("\t")
      tokens = token_field.to_s.split
      tags = tag_field.to_s.split
      if tags.size < tokens.size
        raise ArgumentError, "malformed tagger output: #{line.inspect}"
      end

      tokens.each_with_index.each_with_object({}) do |(token, index), tagged|
        tagged[token] = tags[index].to_sym
      end
    end

    # Inverts a word => tag hash into tag => Set of words, so that words
    # sharing a tag are all kept.
    def self.safe_invert(hash)
      hash.each_with_object({}) do |(word, tag), inverted|
        (inverted[tag] ||= Set.new) << word
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module ArkTweetNlp
  # Version of the gem; read by the gemspec when the gem is built.
  # Frozen so the constant cannot be modified in place at runtime.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,5 @@
1
# Entry point of the gem: loads the version constant and the tagger wrapper.
require "ark_tweet_nlp/version"
require "ark_tweet_nlp/parser"

# Top-level namespace of the gem. Nothing is defined here; the contents come
# from the two files required above.
module ArkTweetNlp; end
@@ -0,0 +1,35 @@
1
require_relative "../lib/ark_tweet_nlp/parser.rb"

describe ArkTweetNlp::Parser do
  # Expected tagging of the sample tweet used throughout these examples.
  let(:tagged_tweet) { { 'faceboooooooook' => :^, 'is' => :V, 'awesome' => :A } }

  describe 'TAGSET' do
    it "It has 25 different types of tags" do
      expect(described_class::TAGSET.size).to eq 25
    end
  end

  # These examples run the real tagger, so they need the launcher script and
  # a working `java` on the machine.
  describe '#find_tags' do
    it "cathegorizes words in tweets" do
      tags = described_class.find_tags('faceboooooooook is awesome')
      expect(tags).to eq([tagged_tweet])
    end

    it "tags multiple tweets per line" do
      tags = described_class.find_tags("faceboooooooook is awesome\nfaceboooooooook is awesome")
      expect(tags).to eq([tagged_tweet, tagged_tweet])
    end
  end

  describe '#get_words_tagged_as' do
    it "returns only the words that where tagged with the specified tags" do
      words = described_class.get_words_tagged_as([tagged_tweet], :A)
      expect(words).to eq(:A => Set.new(['awesome']))
    end

    it "supports multiple tags" do
      words = described_class.get_words_tagged_as([tagged_tweet], :A, :V, :^)
      expect(words).to eq(
        :^ => Set.new(["faceboooooooook"]),
        :V => Set.new(["is"]),
        :A => Set.new(["awesome"])
      )
    end

    it "supports muliple hashes" do
      tagged_result = [tagged_tweet, tagged_tweet.merge('blossom' => :A)]
      words = described_class.get_words_tagged_as(tagged_result, :A, :V, :^)
      expect(words).to eq(
        :^ => Set.new(["faceboooooooook"]),
        :V => Set.new(["is"]),
        :A => Set.new(["awesome", "blossom"])
      )
    end
  end
end
@@ -0,0 +1,2 @@
1
require 'pry'
# The library's entry point is lib/ark_tweet_nlp.rb, so it has to be required
# by that file name; the gem contains no file called ArkTweetNlp.
require 'ark_tweet_nlp'
metadata ADDED
@@ -0,0 +1,216 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ark_tweet_nlp
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Bernardo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-09-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec-nc
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: pry-nav
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: pry-rescue
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: pry-stack_explorer
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: pry-doc
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - '>='
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - '>='
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: guard
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - '>='
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - '>='
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: guard-rspec
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - '>='
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - '>='
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ description: Tags tweets word into multiple cathegories using NLP
168
+ email:
169
+ - bersimoes@gmail.com
170
+ executables:
171
+ - ark-tweet-nlp-0.3.2.jar
172
+ - runTagger.sh
173
+ extensions: []
174
+ extra_rdoc_files: []
175
+ files:
176
+ - .gitignore
177
+ - Gemfile
178
+ - Guardfile
179
+ - LICENSE.txt
180
+ - README.md
181
+ - Rakefile
182
+ - ark_tweet_nlp.gemspec
183
+ - bin/ark-tweet-nlp-0.3.2.jar
184
+ - bin/runTagger.sh
185
+ - lib/ark_tweet_nlp.rb
186
+ - lib/ark_tweet_nlp/parser.rb
187
+ - lib/ark_tweet_nlp/version.rb
188
+ - spec/parser_spec.rb
189
+ - spec/spec_helper.rb
190
+ homepage: https://github.com/golfadas/ark_tweet_nlp_ruby
191
+ licenses:
192
+ - MIT
193
+ metadata: {}
194
+ post_install_message:
195
+ rdoc_options: []
196
+ require_paths:
197
+ - lib
198
+ required_ruby_version: !ruby/object:Gem::Requirement
199
+ requirements:
200
+ - - '>='
201
+ - !ruby/object:Gem::Version
202
+ version: '0'
203
+ required_rubygems_version: !ruby/object:Gem::Requirement
204
+ requirements:
205
+ - - '>='
206
+ - !ruby/object:Gem::Version
207
+ version: '0'
208
+ requirements: []
209
+ rubyforge_project:
210
+ rubygems_version: 2.0.14
211
+ signing_key:
212
+ specification_version: 4
213
+ summary: Ruby wrapper for the Carnegie Mellon Twitter NLP
214
+ test_files:
215
+ - spec/parser_spec.rb
216
+ - spec/spec_helper.rb