phrasie 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest.txt +14 -0
- data/README.md +102 -0
- data/Rakefile +15 -0
- data/lib/phrasie.rb +8 -0
- data/lib/phrasie/data/english-lexicon.txt +93718 -0
- data/lib/phrasie/extractor.rb +68 -0
- data/lib/phrasie/rules.rb +82 -0
- data/lib/phrasie/tag.rb +62 -0
- data/phrasie.gemspec +17 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_helper.rb +3 -0
- data/test/test_phrasie.rb +104 -0
- metadata +83 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
# States of the phrase-building state machine used by Phrasie::Extractor#phrases.
SEARCH = 0 # not currently inside a noun phrase
NOUN = 1   # currently collecting consecutive noun-like terms

module Phrasie
  # Extracts single- and multi-word noun phrases from text and ranks them by
  # how often they occur and how many words they contain.
  class Extractor
    attr_accessor :tagger, :filter

    # options[:filter] - Hash with :occur and :strength thresholds used by
    #                    #validate (defaults to {:strength => 2, :occur => 3}).
    def initialize(options={})
      self.tagger = Tagger.new
      self.filter = options[:filter] || {:strength => 2, :occur => 3}
    end

    def to_s
      "#<Phrasie::Extractor>"
    end

    # Returns the phrases found in +input+ as [phrase, occurrences, word_count]
    # triples, highest score first.
    #
    # input     - a String (tagged with #tagger first) or an Array of
    #             [term, tag, norm] triples. Any other type yields [].
    # min_occur - accepted for backward compatibility but not used; the
    #             thresholds come from #filter.
    #
    # The input array is only read, never modified.
    def phrases(input, min_occur=3)
      if input.is_a? String
        tagged_terms = self.tagger.tag(input)
      elsif input.is_a? Array
        tagged_terms = input
      else
        return []
      end

      terms = {}
      multiterm = []
      state = SEARCH

      tagged_terms.each do |term, tag, norm|
        noun = tag.start_with?('N')
        if state == SEARCH && noun
          state = NOUN
          add(term, norm, multiterm, terms)
        elsif state == SEARCH && tag == 'JJ' && term[0].upcase == term[0]
          # An adjective whose first character is unchanged by upcase may open
          # a phrase (e.g. "British consul").
          state = NOUN
          add(term, norm, multiterm, terms)
        elsif state == NOUN && noun
          add(term, norm, multiterm, terms)
        elsif state == NOUN
          state = SEARCH
          flush(multiterm, terms)
          multiterm = []
        end
      end
      # A phrase that runs up to the very end of the input has no following
      # non-noun token to close it, so count it here.
      flush(multiterm, terms) if state == NOUN

      terms
        .map { |phrase, occurrence| [phrase, occurrence, phrase.split.size] }
        .select { |entry| validate(*entry) }
        .sort_by { |_phrase, occurrence, strength| occurrence + ((occurrence / 5.0) * strength) }
        .reverse
    end

    protected

    # True when a phrase passes the filter: it either occurs at least
    # filter[:occur] times, or occurs at least twice and has at least
    # filter[:strength] words.
    def validate(word, occur, strength)
      occur >= self.filter[:occur] || (occur >= 2 && strength >= self.filter[:strength])
    end

    # Appends the term to the phrase being built and counts its normalized form.
    def add(term, norm, multiterm, terms)
      multiterm << [term, norm]
      terms[norm] = terms.fetch(norm, 0) + 1
    end

    # Counts the collected terms as one phrase when there are at least two.
    def flush(multiterm, terms)
      return unless multiterm.size > 1
      phrase = multiterm.map(&:first).join(' ')
      terms[phrase] = terms.fetch(phrase, 0) + 1
    end
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module Phrasie
  # Post-processing rules applied to tagged terms. Each rule receives the
  # index of the current term, the term itself as a mutable
  # [term, tag, norm] array, and the full list of tagged terms. Rules change
  # the arrays in place and return [id, tagged_term, tagged_terms].
  #
  # The including object must respond to #tags_by_term (term => tag lookup).
  module Rules
    # Plural suffixes tried in order by normalizePluralForms, each paired with
    # the text that replaces it to form the singular candidate.
    PLURAL_SUFFIXES = [['s', ''], ['es', ''], ['ies', 'y']].freeze

    # Determine whether a default noun is plural or singular.
    def correctDefaultNounTag(id, tagged_term, tagged_terms)
      term, tag, _norm = tagged_term
      if tag == 'NND'
        if term.end_with?('s')
          tagged_term[1] = 'NNS'
          tagged_term[2] = term[0..-2]
        else
          tagged_term[1] = 'NN'
        end
      end
      [id, tagged_term, tagged_terms]
    end

    # Verify that noun at sentence start is truly proper.
    # When the lower-cased term is a common noun in the lexicon, the term,
    # its tag and its normalized form are all replaced by the lower-case entry.
    def verifyProperNounAtSentenceStart(id, tagged_term, tagged_terms)
      term, tag, _norm = tagged_term
      if %w[NNP NNPS].include?(tag) && (id == 0 || tagged_terms[id - 1][1] == '.')
        lower_term = term.downcase
        lower_tag = self.tags_by_term[lower_term]
        if %w[NN NNS].include?(lower_tag)
          tagged_term[0] = tagged_term[2] = lower_term
          tagged_term[1] = lower_tag
        end
      end
      [id, tagged_term, tagged_terms]
    end

    # Determine the verb after a modal verb to avoid accidental noun detection.
    # Adverbs (RB) directly after the modal are skipped; the first other term
    # is retagged VB when it was tagged NN.
    def determineVerbAfterModal(id, tagged_term, tagged_terms)
      _term, tag, _norm = tagged_term
      return [id, tagged_term, tagged_terms] if tag != 'MD'

      following = tagged_terms[(id + 1)..-1] || []
      candidate = following.find { |entry| entry[1] != 'RB' }
      candidate[1] = 'VB' if candidate && candidate[1] == 'NN'

      [id, tagged_term, tagged_terms]
    end

    # Set the normalized form of a plural noun to its singular when that
    # singular is known to the lexicon. A suffix is only stripped when the
    # term actually ends with it, so irregular plurals are left untouched.
    def normalizePluralForms(id, tagged_term, tagged_terms)
      term, tag, norm = tagged_term
      if %w[NNS NNPS].include?(tag) && term == norm
        PLURAL_SUFFIXES.each do |suffix, replacement|
          next unless term.end_with?(suffix)
          singular = term[0, term.size - suffix.size] + replacement
          next if self.tags_by_term[singular].nil?
          tagged_term[2] = singular
          break
        end
      end
      [id, tagged_term, tagged_terms]
    end
  end
end
|
data/lib/phrasie/tag.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
module Phrasie
  # Splits text into terms and assigns each a part-of-speech tag looked up in
  # a lexicon file, then refines the tags with the Phrasie::Rules methods.
  class Tagger
    include Phrasie::Rules

    # Three captures per whitespace-delimited chunk: leading non-letters, a
    # core of letters/hyphens/dots ending in a letter, and whatever trails it.
    TERM_SPEC = /([^a-zA-Z]*)([a-zA-Z\-\.]*[a-zA-Z])([^a-zA-Z]*[a-zA-Z]*)/

    # Rule methods applied, in this order, to every tagged term.
    RULES = [
      :correctDefaultNounTag,
      :verifyProperNounAtSentenceStart,
      :determineVerbAfterModal,
      :normalizePluralForms
    ].freeze

    attr_accessor :language, :tags_by_term, :lexicon

    # options[:language] - lexicon language name (default 'english').
    # options[:lexicon]  - path to a lexicon file; defaults to the bundled
    #                      data/<language>-lexicon.txt next to this file.
    def initialize(options={})
      self.language = options[:language] || 'english'
      self.lexicon = options[:lexicon] || File.expand_path("#{__FILE__}/../data/#{self.language}-lexicon.txt")
      self.tags_by_term = load_lexicon(self.lexicon)
    end

    # Splits +text+ on whitespace and breaks every chunk into the non-empty
    # TERM_SPEC captures. Chunks that do not match are kept whole.
    def tokenize(text)
      text.split(/\s/).each_with_object([]) do |chunk, terms|
        next if chunk == ''
        match = TERM_SPEC.match(chunk)
        if match.nil?
          terms << chunk
        else
          terms.concat(match.captures.reject { |part| part == '' })
        end
      end
    end

    # Returns an Array of [term, tag, norm] triples for +input+, which may be
    # a String (tokenized first) or an Array of terms. Other types yield [].
    # Terms missing from the lexicon start with the default noun tag "NND".
    def tag(input)
      if input.is_a? String
        terms = self.tokenize(input)
      elsif input.is_a? Array
        terms = input
      else
        return []
      end

      tagged_terms = terms.map do |term|
        [term, self.tags_by_term[term] || "NND", term]
      end

      # Every rule mutates the triples in place, so their return values are
      # not needed here.
      tagged_terms.each_with_index do |tagged_term, id|
        RULES.each { |rule| send(rule, id, tagged_term, tagged_terms) }
      end
      tagged_terms
    end

    private

    # Reads a lexicon file line by line into a term => tag Hash, using the
    # first two whitespace-separated fields of each line. Blank lines and
    # lines without a tag are skipped.
    def load_lexicon(path)
      table = {}
      File.foreach(path) do |line|
        term, tag = line.split
        next if term.nil? || tag.nil?
        table[term] = tag
      end
      table
    end
  end
end
|
data/phrasie.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Gem specification for phrasie. The packaged file list is taken from
# Manifest.txt, one path per line.
Gem::Specification.new do |s|
  s.name = "phrasie"
  s.version = '0.1.3'
  s.authors = ["Ashley Williams"]
  s.email = ["hi@ashleyw.co.uk"]
  s.summary = "Determines important terms within a given piece of content."
  s.homepage = "https://github.com/ashleyw/Phrasie"
  s.description = "Determines important terms within a given piece of content. It
uses linguistic tools such as Parts-Of-Speech (POS) and some simple
statistical analysis to determine the terms and their strength."

  s.required_rubygems_version = ">= 1.3.6"
  s.rubyforge_project = "phrasie"

  # Resolve the manifest next to this gemspec so the spec loads correctly no
  # matter which directory it is evaluated from.
  s.files = File.read(File.expand_path('../Manifest.txt', __FILE__)).split("\n")
  s.require_path = 'lib'
end
|
data/script/console
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
#!/usr/bin/env ruby
# File: script/console
# Replaces this process with an IRB session that has lib/phrasie.rb required.

# Use irb.bat when RUBY_PLATFORM mentions mswin or mingw, plain irb otherwise.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.dirname(__FILE__) + '/../lib/phrasie.rb'}"
puts "Loading phrasie gem"
exec "#{irb} #{libs} --simple-prompt"
|
data/script/destroy
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs RubiGen's destroy script with the arguments given on the command line.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

begin
  require 'rubigen'
rescue LoadError
  # rubigen could not be loaded directly; load RubyGems and try again.
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# Drop a leading help flag before handing the arguments over.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV[0])

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources! component_sources
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs RubiGen's generate script with the arguments given on the command line.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

begin
  require 'rubigen'
rescue LoadError
  # rubigen could not be loaded directly; load RubyGems and try again.
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# Drop a leading help flag before handing the arguments over.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV[0])

component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources! component_sources
RubiGen::Scripts::Generate.new.run(ARGV)
|
data/test/test_helper.rb (hunk not shown)
data/test/test_phrasie.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'test_helper.rb')

# Exercises Phrasie::Extractor#phrases against short and long news-style texts.
class TestPhrasie < Test::Unit::TestCase
  # Two sentences that share a single two-word phrase.
  TEXT = 'The British consul of Boston resides in Newton. The British consul is awesome.'

  # Multi-paragraph article used by the size and filter tests.
  LONG_TEXT = %(Police shut Palestinian theatre in Jerusalem.

Israeli police have shut down a Palestinian theatre in East Jerusalem.

The action, on Thursday, prevented the closing event of an international
literature festival from taking place.

Police said they were acting on a court order, issued after intelligence
indicated that the Palestinian Authority was involved in the event.

Israel has occupied East Jerusalem since 1967 and has annexed the
area. This is not recognised by the international community.

The British consul-general in Jerusalem , Richard Makepeace, was
attending the event.

"I think all lovers of literature would regard this as a very
regrettable moment and regrettable decision," he added.

Mr Makepeace said the festival's closing event would be reorganised to
take place at the British Council in Jerusalem.

The Israeli authorities often take action against events in East
Jerusalem they see as connected to the Palestinian Authority.

Saturday's opening event at the same theatre was also shut down.

A police notice said the closure was on the orders of Israel's internal
security minister on the grounds of a breach of interim peace accords
from the 1990s.

These laid the framework for talks on establishing a Palestinian state
alongside Israel, but left the status of Jerusalem to be determined by
further negotiation.

Israel has annexed East Jerusalem and declares it part of its eternal
capital.

Palestinians hope to establish their capital in the area.)

  # Article containing abbreviations, times and hyphenated tokens.
  NON_WORDS_TEXT = %(LONDON - WikiLeaks founder Julian Assange was refused bail and jailed for a week by a British court Tuesday, pending an extradition hearing over alleged sex offenses in Sweden.
Assange turned himself in to U.K. police earlier in the day in the latest blow to his WikiLeaks organization, which faces legal, financial and technological challenges after releasing hundreds of secret U.S. diplomatic cables.
Swedish prosecutors had issued an arrest warrant for the 39-year-old Australian, who is accused of sexual misconduct with two women.
Assange surrendered at 9:30 a.m. local time (4:30 a.m. ET) Tuesday. The U.K.'s Guardian newspaper reported that Assange later arrived at a London court accompanied by British lawyers Mark Stephens and Jennifer Robinson.
During his court appearance, Assange said he would fight extradition to Sweden and he provided the court with an Australian address. Britain's Sky News reported that Assange was receiving consular assistance from officials at the Australian High Commission.
The next court hearing is scheduled for next Tuesday, and Assange will remain in custody until then because he was deemed to be a flight risk.
Judge Howard Riddle asked Assange whether he understood that he could agree to be extradited to Sweden. Assange, dressed in a navy blue suit, cleared his throat and said: "I understand that and I do not consent."
The judge said he had grounds to believe that the former computer hacker - a self-described homeless refugee - might not show up to his next hearing if he were granted bail.
Arguments during the hour-long hearing detailed the sex accusations against Assange, all of which he has denied.
Australian journalist John Pilger, British film director Ken Loach and Jemima Khan, former wife of Pakistani cricketer and politician Imran Khan, all offered to put up sureties to persuade the court Assange would not flee.)

  def setup
    @text = TEXT
    @long_text = LONG_TEXT
    @extractor = Phrasie::Extractor.new
  end

  def test_extractor
    expected = [["British consul", 2, 2]]
    by_occurrence = @extractor.phrases(@text).sort_by { |entry| entry[1] }
    assert_equal expected, by_occurrence
  end

  def test_non_words
    found = @extractor.phrases(NON_WORDS_TEXT)
    assert_equal 7, found.size
  end

  # [["Jerusalem", 8, 1],
  # ["event", 6, 1],
  # ["Palestinian", 6, 1],
  # ["East Jerusalem", 4, 2],
  # ["East", 4, 1],
  # ["police", 4, 1],
  # ["Israel", 4, 1],
  # ["theatre", 3, 1],
  # ["Palestinian theatre", 2, 2],
  # ["Palestinian Authority", 2, 2],
  # ["opening event", 1, 2],
  # ["Israeli authorities", 1, 2],
  # ["Richard Makepeace", 1, 2],
  # ["court order", 1, 2],
  # ["literature festival", 1, 2],
  # ["British consul-general", 1, 2],
  # ["police notice", 1, 2],
  # ["security minister", 1, 2],
  # ["Israeli police", 1, 2],
  # ["peace accords", 1, 2],
  # ["Mr Makepeace", 1, 2],
  # ["British Council", 1, 2],
  # ["Palestinian state", 1, 2],
  # ["Palestinians hope", 1, 2]]

  def test_long_text
    found = @extractor.phrases(@long_text)
    assert_equal 10, found.size
  end

  def test_filter_options
    @extractor.filter = {:occur => 4, :strength => 3}
    found = @extractor.phrases(@long_text)
    assert_equal 7, found.size
  end

  def test_extractor_to_s
    assert_equal "#<Phrasie::Extractor>", @extractor.to_s
  end
end
|
metadata
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: phrasie
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 3
|
9
|
+
version: 0.1.3
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Ashley Williams
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-01-17 00:00:00 +00:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: |-
|
22
|
+
Determines important terms within a given piece of content. It
|
23
|
+
uses linguistic tools such as Parts-Of-Speech (POS) and some simple
|
24
|
+
statistical analysis to determine the terms and their strength.
|
25
|
+
email:
|
26
|
+
- hi@ashleyw.co.uk
|
27
|
+
executables: []
|
28
|
+
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files: []
|
32
|
+
|
33
|
+
files:
|
34
|
+
- Manifest.txt
|
35
|
+
- README.md
|
36
|
+
- Rakefile
|
37
|
+
- lib/phrasie.rb
|
38
|
+
- lib/phrasie/extractor.rb
|
39
|
+
- lib/phrasie/rules.rb
|
40
|
+
- lib/phrasie/tag.rb
|
41
|
+
- lib/phrasie/data/english-lexicon.txt
|
42
|
+
- script/console
|
43
|
+
- script/destroy
|
44
|
+
- script/generate
|
45
|
+
- phrasie.gemspec
|
46
|
+
- test/test_helper.rb
|
47
|
+
- test/test_phrasie.rb
|
48
|
+
has_rdoc: true
|
49
|
+
homepage: https://github.com/ashleyw/Phrasie
|
50
|
+
licenses: []
|
51
|
+
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options: []
|
54
|
+
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
segments:
|
63
|
+
- 0
|
64
|
+
version: "0"
|
65
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
segments:
|
71
|
+
- 1
|
72
|
+
- 3
|
73
|
+
- 6
|
74
|
+
version: 1.3.6
|
75
|
+
requirements: []
|
76
|
+
|
77
|
+
rubyforge_project: phrasie
|
78
|
+
rubygems_version: 1.3.7
|
79
|
+
signing_key:
|
80
|
+
specification_version: 3
|
81
|
+
summary: Determines important terms within a given piece of content.
|
82
|
+
test_files: []
|
83
|
+
|