phrasie 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +1 -1
- data/lib/phrasie.rb +5 -3
- data/lib/phrasie/extractor.rb +21 -10
- data/lib/phrasie/tag.rb +7 -1
- data/phrasie.gemspec +1 -1
- data/test/test_phrasie.rb +0 -25
- metadata +2 -2
data/README.md
CHANGED
@@ -8,7 +8,7 @@ statistical analysis to determine the terms and their strength.
|
|
8
8
|
|
9
9
|
Based on the excellent Python library [topia.termextract](http://pypi.python.org/pypi/topia.termextract/).
|
10
10
|
|
11
|
-
**Tested on Ruby 1.9.2.**
|
11
|
+
**Tested on Ruby 1.8.7 and 1.9.2.**
|
12
12
|
|
13
13
|
## SYNOPSIS:
|
14
14
|
|
data/lib/phrasie.rb
CHANGED
data/lib/phrasie/extractor.rb
CHANGED
@@ -7,14 +7,15 @@ module Phrasie
|
|
7
7
|
|
8
8
|
def initialize(options={})
|
9
9
|
self.tagger = Tagger.new
|
10
|
-
self.filter =
|
10
|
+
self.filter = {:strength => 2, :occur => 3}.merge(options[:filter] || {})
|
11
11
|
end
|
12
12
|
|
13
13
|
def to_s
|
14
14
|
"#<Phrasie::Extractor>"
|
15
15
|
end
|
16
|
-
|
17
|
-
|
16
|
+
|
17
|
+
# Returns an array of [phrase, occurrences, # of words in phrase]
|
18
|
+
def phrases(input, filter=nil)
|
18
19
|
if input.is_a? String
|
19
20
|
taggedTerms = self.tagger.tag(input)
|
20
21
|
elsif input.is_a? Array
|
@@ -22,6 +23,13 @@ module Phrasie
|
|
22
23
|
else
|
23
24
|
return []
|
24
25
|
end
|
26
|
+
|
27
|
+
unless filter.nil?
|
28
|
+
self.filter = self.filter.merge(filter)
|
29
|
+
if self.filter[:occur].to_s[/%/]
|
30
|
+
self.filter[:occur] = [(taggedTerms.size * 0.01), 2].sort.last.round
|
31
|
+
end
|
32
|
+
end
|
25
33
|
|
26
34
|
terms = {}
|
27
35
|
multiterm = []
|
@@ -29,15 +37,15 @@ module Phrasie
|
|
29
37
|
|
30
38
|
while taggedTerms.size > 0
|
31
39
|
term, tag, norm = taggedTerms.shift
|
32
|
-
if state == SEARCH && tag[0] == "N"
|
40
|
+
if state == SEARCH && tag[0,1] == "N"
|
33
41
|
state = NOUN
|
34
42
|
add(term, norm, multiterm, terms)
|
35
|
-
elsif state == SEARCH && tag == 'JJ' && term[0].upcase == term[0]
|
43
|
+
elsif state == SEARCH && tag == 'JJ' && term[0,1].upcase == term[0,1]
|
36
44
|
state = NOUN
|
37
45
|
add(term, norm, multiterm, terms)
|
38
|
-
elsif state == NOUN && tag[0] == "N"
|
46
|
+
elsif state == NOUN && tag[0,1] == "N"
|
39
47
|
add(term, norm, multiterm, terms)
|
40
|
-
elsif state == NOUN && tag[0] != "N"
|
48
|
+
elsif state == NOUN && tag[0,1] != "N"
|
41
49
|
state = SEARCH
|
42
50
|
if multiterm.size > 1
|
43
51
|
word = multiterm.map(&:first).join(' ')
|
@@ -47,18 +55,21 @@ module Phrasie
|
|
47
55
|
multiterm = []
|
48
56
|
end
|
49
57
|
end
|
50
|
-
|
58
|
+
|
51
59
|
return terms \
|
52
60
|
.map{|phrase, occurance| [phrase, occurance, phrase.split.size] } \
|
53
|
-
.
|
61
|
+
.delete_if{|arr| !self.validate(*arr)} \
|
54
62
|
.sort_by{|phrase, occurance, strength| occurance + ((occurance/5.0)*strength) }.reverse
|
55
63
|
end
|
56
64
|
|
57
65
|
protected
|
66
|
+
|
67
|
+
# Validates the phrase is within the bounds of our filter
|
58
68
|
def validate(word, occur, strength)
|
59
69
|
occur >= self.filter[:occur] || (occur >= 2 && strength >= self.filter[:strength])
|
60
70
|
end
|
61
|
-
|
71
|
+
|
72
|
+
# Used within #phrases
|
62
73
|
def add(term, norm, multiterm, terms)
|
63
74
|
multiterm << [term, norm]
|
64
75
|
terms[norm] ||= 0
|
data/lib/phrasie/tag.rb
CHANGED
@@ -11,6 +11,7 @@ module Phrasie
|
|
11
11
|
self.tags_by_term = Hash[file.split("\n").map{|line| line.split.first(2)}]
|
12
12
|
end
|
13
13
|
|
14
|
+
# Takes some input text and outputs an array of the words contained in it.
|
14
15
|
def tokenize(text)
|
15
16
|
terms = []
|
16
17
|
text.split(/\s/).each do |term|
|
@@ -29,6 +30,8 @@ module Phrasie
|
|
29
30
|
return terms
|
30
31
|
end
|
31
32
|
|
33
|
+
# Takes an array from #tokenize, or a string which it pipes through #tokenize,
|
34
|
+
# and returns the words with part-of-speech tags.
|
32
35
|
def tag(input)
|
33
36
|
if input.is_a? String
|
34
37
|
terms = self.tokenize(input)
|
@@ -43,7 +46,8 @@ module Phrasie
|
|
43
46
|
tag = self.tags_by_term[term] || "NND"
|
44
47
|
tagged_terms << [term, tag, term]
|
45
48
|
end
|
46
|
-
|
49
|
+
|
50
|
+
# These rules are defined in rules.rb
|
47
51
|
rules = [
|
48
52
|
'correctDefaultNounTag',
|
49
53
|
'verifyProperNounAtSentenceStart',
|
@@ -56,7 +60,9 @@ module Phrasie
|
|
56
60
|
id, tagged_terms[id], tagged_terms = self.send(rule.to_sym, id, tagged_term, tagged_terms)
|
57
61
|
end
|
58
62
|
end
|
63
|
+
|
59
64
|
return tagged_terms
|
60
65
|
end
|
66
|
+
|
61
67
|
end
|
62
68
|
end
|
data/phrasie.gemspec
CHANGED
data/test/test_phrasie.rb
CHANGED
@@ -64,31 +64,6 @@ class TestPhrasie < Test::Unit::TestCase
|
|
64
64
|
assert_equal 7, @extractor.phrases(text).size
|
65
65
|
end
|
66
66
|
|
67
|
-
# [["Jerusalem", 8, 1],
|
68
|
-
# ["event", 6, 1],
|
69
|
-
# ["Palestinian", 6, 1],
|
70
|
-
# ["East Jerusalem", 4, 2],
|
71
|
-
# ["East", 4, 1],
|
72
|
-
# ["police", 4, 1],
|
73
|
-
# ["Israel", 4, 1],
|
74
|
-
# ["theatre", 3, 1],
|
75
|
-
# ["Palestinian theatre", 2, 2],
|
76
|
-
# ["Palestinian Authority", 2, 2],
|
77
|
-
# ["opening event", 1, 2],
|
78
|
-
# ["Israeli authorities", 1, 2],
|
79
|
-
# ["Richard Makepeace", 1, 2],
|
80
|
-
# ["court order", 1, 2],
|
81
|
-
# ["literature festival", 1, 2],
|
82
|
-
# ["British consul-general", 1, 2],
|
83
|
-
# ["police notice", 1, 2],
|
84
|
-
# ["security minister", 1, 2],
|
85
|
-
# ["Israeli police", 1, 2],
|
86
|
-
# ["peace accords", 1, 2],
|
87
|
-
# ["Mr Makepeace", 1, 2],
|
88
|
-
# ["British Council", 1, 2],
|
89
|
-
# ["Palestinian state", 1, 2],
|
90
|
-
# ["Palestinians hope", 1, 2]]
|
91
|
-
|
92
67
|
def test_long_text
|
93
68
|
assert_equal 10, @extractor.phrases(@long_text).size
|
94
69
|
end
|