phrasie 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -1
- data/lib/phrasie.rb +5 -3
- data/lib/phrasie/extractor.rb +21 -10
- data/lib/phrasie/tag.rb +7 -1
- data/phrasie.gemspec +1 -1
- data/test/test_phrasie.rb +0 -25
- metadata +2 -2
data/README.md
CHANGED
@@ -8,7 +8,7 @@ statistical analysis to determine the terms and their strength.
|
|
8
8
|
|
9
9
|
Based on the excellent Python library [topia.termextract](http://pypi.python.org/pypi/topia.termextract/).
|
10
10
|
|
11
|
-
**Tested on Ruby 1.9.2.**
|
11
|
+
**Tested on Ruby 1.8.7 and 1.9.2.**
|
12
12
|
|
13
13
|
## SYNOPSIS:
|
14
14
|
|
data/lib/phrasie.rb
CHANGED
data/lib/phrasie/extractor.rb
CHANGED
@@ -7,14 +7,15 @@ module Phrasie
|
|
7
7
|
|
8
8
|
def initialize(options={})
|
9
9
|
self.tagger = Tagger.new
|
10
|
-
self.filter =
|
10
|
+
self.filter = {:strength => 2, :occur => 3}.merge(options[:filter] || {})
|
11
11
|
end
|
12
12
|
|
13
13
|
def to_s
|
14
14
|
"#<Phrasie::Extractor>"
|
15
15
|
end
|
16
|
-
|
17
|
-
|
16
|
+
|
17
|
+
# Returns an array of [phrase, occurrences, # of words in phrase]
|
18
|
+
def phrases(input, filter=nil)
|
18
19
|
if input.is_a? String
|
19
20
|
taggedTerms = self.tagger.tag(input)
|
20
21
|
elsif input.is_a? Array
|
@@ -22,6 +23,13 @@ module Phrasie
|
|
22
23
|
else
|
23
24
|
return []
|
24
25
|
end
|
26
|
+
|
27
|
+
unless filter.nil?
|
28
|
+
self.filter = self.filter.merge(filter)
|
29
|
+
if self.filter[:occur].to_s[/%/]
|
30
|
+
self.filter[:occur] = [(taggedTerms.size * 0.01), 2].sort.last.round
|
31
|
+
end
|
32
|
+
end
|
25
33
|
|
26
34
|
terms = {}
|
27
35
|
multiterm = []
|
@@ -29,15 +37,15 @@ module Phrasie
|
|
29
37
|
|
30
38
|
while taggedTerms.size > 0
|
31
39
|
term, tag, norm = taggedTerms.shift
|
32
|
-
if state == SEARCH && tag[0] == "N"
|
40
|
+
if state == SEARCH && tag[0,1] == "N"
|
33
41
|
state = NOUN
|
34
42
|
add(term, norm, multiterm, terms)
|
35
|
-
elsif state == SEARCH && tag == 'JJ' && term[0].upcase == term[0]
|
43
|
+
elsif state == SEARCH && tag == 'JJ' && term[0,1].upcase == term[0,1]
|
36
44
|
state = NOUN
|
37
45
|
add(term, norm, multiterm, terms)
|
38
|
-
elsif state == NOUN && tag[0] == "N"
|
46
|
+
elsif state == NOUN && tag[0,1] == "N"
|
39
47
|
add(term, norm, multiterm, terms)
|
40
|
-
elsif state == NOUN && tag[0] != "N"
|
48
|
+
elsif state == NOUN && tag[0,1] != "N"
|
41
49
|
state = SEARCH
|
42
50
|
if multiterm.size > 1
|
43
51
|
word = multiterm.map(&:first).join(' ')
|
@@ -47,18 +55,21 @@ module Phrasie
|
|
47
55
|
multiterm = []
|
48
56
|
end
|
49
57
|
end
|
50
|
-
|
58
|
+
|
51
59
|
return terms \
|
52
60
|
.map{|phrase, occurance| [phrase, occurance, phrase.split.size] } \
|
53
|
-
.
|
61
|
+
.delete_if{|arr| !self.validate(*arr)} \
|
54
62
|
.sort_by{|phrase, occurance, strength| occurance + ((occurance/5.0)*strength) }.reverse
|
55
63
|
end
|
56
64
|
|
57
65
|
protected
|
66
|
+
|
67
|
+
# Validates the phrase is within the bounds of our filter
|
58
68
|
def validate(word, occur, strength)
|
59
69
|
occur >= self.filter[:occur] || (occur >= 2 && strength >= self.filter[:strength])
|
60
70
|
end
|
61
|
-
|
71
|
+
|
72
|
+
# Used within #phrases
|
62
73
|
def add(term, norm, multiterm, terms)
|
63
74
|
multiterm << [term, norm]
|
64
75
|
terms[norm] ||= 0
|
data/lib/phrasie/tag.rb
CHANGED
@@ -11,6 +11,7 @@ module Phrasie
|
|
11
11
|
self.tags_by_term = Hash[file.split("\n").map{|line| line.split.first(2)}]
|
12
12
|
end
|
13
13
|
|
14
|
+
# Takes some input text and outputs an array of the words contained in it.
|
14
15
|
def tokenize(text)
|
15
16
|
terms = []
|
16
17
|
text.split(/\s/).each do |term|
|
@@ -29,6 +30,8 @@ module Phrasie
|
|
29
30
|
return terms
|
30
31
|
end
|
31
32
|
|
33
|
+
# Takes an array from #tokenize, or a string which it pipes through #tokenize,
|
34
|
+
# and returns the words with part-of-speech tags.
|
32
35
|
def tag(input)
|
33
36
|
if input.is_a? String
|
34
37
|
terms = self.tokenize(input)
|
@@ -43,7 +46,8 @@ module Phrasie
|
|
43
46
|
tag = self.tags_by_term[term] || "NND"
|
44
47
|
tagged_terms << [term, tag, term]
|
45
48
|
end
|
46
|
-
|
49
|
+
|
50
|
+
# These rules are defined in rules.rb
|
47
51
|
rules = [
|
48
52
|
'correctDefaultNounTag',
|
49
53
|
'verifyProperNounAtSentenceStart',
|
@@ -56,7 +60,9 @@ module Phrasie
|
|
56
60
|
id, tagged_terms[id], tagged_terms = self.send(rule.to_sym, id, tagged_term, tagged_terms)
|
57
61
|
end
|
58
62
|
end
|
63
|
+
|
59
64
|
return tagged_terms
|
60
65
|
end
|
66
|
+
|
61
67
|
end
|
62
68
|
end
|
data/phrasie.gemspec
CHANGED
data/test/test_phrasie.rb
CHANGED
@@ -64,31 +64,6 @@ class TestPhrasie < Test::Unit::TestCase
|
|
64
64
|
assert_equal 7, @extractor.phrases(text).size
|
65
65
|
end
|
66
66
|
|
67
|
-
# [["Jerusalem", 8, 1],
|
68
|
-
# ["event", 6, 1],
|
69
|
-
# ["Palestinian", 6, 1],
|
70
|
-
# ["East Jerusalem", 4, 2],
|
71
|
-
# ["East", 4, 1],
|
72
|
-
# ["police", 4, 1],
|
73
|
-
# ["Israel", 4, 1],
|
74
|
-
# ["theatre", 3, 1],
|
75
|
-
# ["Palestinian theatre", 2, 2],
|
76
|
-
# ["Palestinian Authority", 2, 2],
|
77
|
-
# ["opening event", 1, 2],
|
78
|
-
# ["Israeli authorities", 1, 2],
|
79
|
-
# ["Richard Makepeace", 1, 2],
|
80
|
-
# ["court order", 1, 2],
|
81
|
-
# ["literature festival", 1, 2],
|
82
|
-
# ["British consul-general", 1, 2],
|
83
|
-
# ["police notice", 1, 2],
|
84
|
-
# ["security minister", 1, 2],
|
85
|
-
# ["Israeli police", 1, 2],
|
86
|
-
# ["peace accords", 1, 2],
|
87
|
-
# ["Mr Makepeace", 1, 2],
|
88
|
-
# ["British Council", 1, 2],
|
89
|
-
# ["Palestinian state", 1, 2],
|
90
|
-
# ["Palestinians hope", 1, 2]]
|
91
|
-
|
92
67
|
def test_long_text
|
93
68
|
assert_equal 10, @extractor.phrases(@long_text).size
|
94
69
|
end
|