phrasie 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -8,7 +8,7 @@ statistical analysis to determine the terms and their strength.
8
8
 
9
9
  Based on the excellent Python library [topia.termextract](http://pypi.python.org/pypi/topia.termextract/).
10
10
 
11
- **Tested on Ruby 1.9.2.**
11
+ **Tested on Ruby 1.8.7 and 1.9.2.**
12
12
 
13
13
  ## SYNOPSIS:
14
14
 
@@ -3,6 +3,8 @@ $:.unshift(File.dirname(__FILE__)) unless
3
3
 
4
4
  require 'phrasie/rules'
5
5
  require 'phrasie/tag'
6
- require 'phrasie/extractor'
7
-
8
- VERSION = '0.1.3'
6
+ require 'phrasie/extractor'
7
+
8
+ module Phrasie
9
+ VERSION = '0.1.4'
10
+ end
@@ -7,14 +7,15 @@ module Phrasie
7
7
 
8
8
  def initialize(options={})
9
9
  self.tagger = Tagger.new
10
- self.filter = options[:filter] || {:strength => 2, :occur => 3}
10
+ self.filter = {:strength => 2, :occur => 3}.merge(options[:filter] || {})
11
11
  end
12
12
 
13
13
  def to_s
14
14
  "#<Phrasie::Extractor>"
15
15
  end
16
-
17
- def phrases(input, min_occur=3)
16
+
17
+ # Returns an array of [phrase, occurrences, # of words in phrase]
18
+ def phrases(input, filter=nil)
18
19
  if input.is_a? String
19
20
  taggedTerms = self.tagger.tag(input)
20
21
  elsif input.is_a? Array
@@ -22,6 +23,13 @@ module Phrasie
22
23
  else
23
24
  return []
24
25
  end
26
+
27
+ unless filter.nil?
28
+ self.filter = self.filter.merge(filter)
29
+ if self.filter[:occur].to_s[/%/]
30
+ self.filter[:occur] = [(taggedTerms.size * 0.01), 2].sort.last.round
31
+ end
32
+ end
25
33
 
26
34
  terms = {}
27
35
  multiterm = []
@@ -29,15 +37,15 @@ module Phrasie
29
37
 
30
38
  while taggedTerms.size > 0
31
39
  term, tag, norm = taggedTerms.shift
32
- if state == SEARCH && tag[0] == "N"
40
+ if state == SEARCH && tag[0,1] == "N"
33
41
  state = NOUN
34
42
  add(term, norm, multiterm, terms)
35
- elsif state == SEARCH && tag == 'JJ' && term[0].upcase == term[0]
43
+ elsif state == SEARCH && tag == 'JJ' && term[0,1].upcase == term[0,1]
36
44
  state = NOUN
37
45
  add(term, norm, multiterm, terms)
38
- elsif state == NOUN && tag[0] == "N"
46
+ elsif state == NOUN && tag[0,1] == "N"
39
47
  add(term, norm, multiterm, terms)
40
- elsif state == NOUN && tag[0] != "N"
48
+ elsif state == NOUN && tag[0,1] != "N"
41
49
  state = SEARCH
42
50
  if multiterm.size > 1
43
51
  word = multiterm.map(&:first).join(' ')
@@ -47,18 +55,21 @@ module Phrasie
47
55
  multiterm = []
48
56
  end
49
57
  end
50
-
58
+
51
59
  return terms \
52
60
  .map{|phrase, occurance| [phrase, occurance, phrase.split.size] } \
53
- .keep_if{|arr| self.validate(*arr)} \
61
+ .delete_if{|arr| !self.validate(*arr)} \
54
62
  .sort_by{|phrase, occurance, strength| occurance + ((occurance/5.0)*strength) }.reverse
55
63
  end
56
64
 
57
65
  protected
66
+
67
+ # Validates that the phrase is within the bounds of our filter
58
68
  def validate(word, occur, strength)
59
69
  occur >= self.filter[:occur] || (occur >= 2 && strength >= self.filter[:strength])
60
70
  end
61
-
71
+
72
+ # Used within #phrases
62
73
  def add(term, norm, multiterm, terms)
63
74
  multiterm << [term, norm]
64
75
  terms[norm] ||= 0
@@ -11,6 +11,7 @@ module Phrasie
11
11
  self.tags_by_term = Hash[file.split("\n").map{|line| line.split.first(2)}]
12
12
  end
13
13
 
14
+ # Takes some input text and outputs an array of the words contained in it.
14
15
  def tokenize(text)
15
16
  terms = []
16
17
  text.split(/\s/).each do |term|
@@ -29,6 +30,8 @@ module Phrasie
29
30
  return terms
30
31
  end
31
32
 
33
+ # Takes an array from #tokenize, or a string which it pipes through #tokenize,
34
+ # and returns the words with part-of-speech tags.
32
35
  def tag(input)
33
36
  if input.is_a? String
34
37
  terms = self.tokenize(input)
@@ -43,7 +46,8 @@ module Phrasie
43
46
  tag = self.tags_by_term[term] || "NND"
44
47
  tagged_terms << [term, tag, term]
45
48
  end
46
-
49
+
50
+ # These rules are defined in rules.rb
47
51
  rules = [
48
52
  'correctDefaultNounTag',
49
53
  'verifyProperNounAtSentenceStart',
@@ -56,7 +60,9 @@ module Phrasie
56
60
  id, tagged_terms[id], tagged_terms = self.send(rule.to_sym, id, tagged_term, tagged_terms)
57
61
  end
58
62
  end
63
+
59
64
  return tagged_terms
60
65
  end
66
+
61
67
  end
62
68
  end
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "phrasie"
3
- s.version = '0.1.3'
3
+ s.version = '0.1.4'
4
4
  s.authors = ["Ashley Williams"]
5
5
  s.email = ["hi@ashleyw.co.uk"]
6
6
  s.summary = "Determines important terms within a given piece of content."
@@ -64,31 +64,6 @@ class TestPhrasie < Test::Unit::TestCase
64
64
  assert_equal 7, @extractor.phrases(text).size
65
65
  end
66
66
 
67
- # [["Jerusalem", 8, 1],
68
- # ["event", 6, 1],
69
- # ["Palestinian", 6, 1],
70
- # ["East Jerusalem", 4, 2],
71
- # ["East", 4, 1],
72
- # ["police", 4, 1],
73
- # ["Israel", 4, 1],
74
- # ["theatre", 3, 1],
75
- # ["Palestinian theatre", 2, 2],
76
- # ["Palestinian Authority", 2, 2],
77
- # ["opening event", 1, 2],
78
- # ["Israeli authorities", 1, 2],
79
- # ["Richard Makepeace", 1, 2],
80
- # ["court order", 1, 2],
81
- # ["literature festival", 1, 2],
82
- # ["British consul-general", 1, 2],
83
- # ["police notice", 1, 2],
84
- # ["security minister", 1, 2],
85
- # ["Israeli police", 1, 2],
86
- # ["peace accords", 1, 2],
87
- # ["Mr Makepeace", 1, 2],
88
- # ["British Council", 1, 2],
89
- # ["Palestinian state", 1, 2],
90
- # ["Palestinians hope", 1, 2]]
91
-
92
67
  def test_long_text
93
68
  assert_equal 10, @extractor.phrases(@long_text).size
94
69
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 3
9
- version: 0.1.3
8
+ - 4
9
+ version: 0.1.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Ashley Williams