phrasie 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -8,7 +8,7 @@ statistical analysis to determine the terms and their strength.
8
8
 
9
9
  Based on the excellent Python library [topia.termextract](http://pypi.python.org/pypi/topia.termextract/).
10
10
 
11
- **Tested on Ruby 1.9.2.**
11
+ **Tested on Ruby 1.8.7 and 1.9.2.**
12
12
 
13
13
  ## SYNOPSIS:
14
14
 
@@ -3,6 +3,8 @@ $:.unshift(File.dirname(__FILE__)) unless
3
3
 
4
4
  require 'phrasie/rules'
5
5
  require 'phrasie/tag'
6
- require 'phrasie/extractor'
7
-
8
- VERSION = '0.1.3'
6
+ require 'phrasie/extractor'
7
+
8
+ module Phrasie
9
+ VERSION = '0.1.4'
10
+ end
@@ -7,14 +7,15 @@ module Phrasie
7
7
 
8
8
  def initialize(options={})
9
9
  self.tagger = Tagger.new
10
- self.filter = options[:filter] || {:strength => 2, :occur => 3}
10
+ self.filter = {:strength => 2, :occur => 3}.merge(options[:filter] || {})
11
11
  end
12
12
 
13
13
  def to_s
14
14
  "#<Phrasie::Extractor>"
15
15
  end
16
-
17
- def phrases(input, min_occur=3)
16
+
17
+ # Returns an array of [phrase, occurrences, # of words in phrase]
18
+ def phrases(input, filter=nil)
18
19
  if input.is_a? String
19
20
  taggedTerms = self.tagger.tag(input)
20
21
  elsif input.is_a? Array
@@ -22,6 +23,13 @@ module Phrasie
22
23
  else
23
24
  return []
24
25
  end
26
+
27
+ unless filter.nil?
28
+ self.filter = self.filter.merge(filter)
29
+ if self.filter[:occur].to_s[/%/]
30
+ self.filter[:occur] = [(taggedTerms.size * 0.01), 2].sort.last.round
31
+ end
32
+ end
25
33
 
26
34
  terms = {}
27
35
  multiterm = []
@@ -29,15 +37,15 @@ module Phrasie
29
37
 
30
38
  while taggedTerms.size > 0
31
39
  term, tag, norm = taggedTerms.shift
32
- if state == SEARCH && tag[0] == "N"
40
+ if state == SEARCH && tag[0,1] == "N"
33
41
  state = NOUN
34
42
  add(term, norm, multiterm, terms)
35
- elsif state == SEARCH && tag == 'JJ' && term[0].upcase == term[0]
43
+ elsif state == SEARCH && tag == 'JJ' && term[0,1].upcase == term[0,1]
36
44
  state = NOUN
37
45
  add(term, norm, multiterm, terms)
38
- elsif state == NOUN && tag[0] == "N"
46
+ elsif state == NOUN && tag[0,1] == "N"
39
47
  add(term, norm, multiterm, terms)
40
- elsif state == NOUN && tag[0] != "N"
48
+ elsif state == NOUN && tag[0,1] != "N"
41
49
  state = SEARCH
42
50
  if multiterm.size > 1
43
51
  word = multiterm.map(&:first).join(' ')
@@ -47,18 +55,21 @@ module Phrasie
47
55
  multiterm = []
48
56
  end
49
57
  end
50
-
58
+
51
59
  return terms \
52
60
  .map{|phrase, occurance| [phrase, occurance, phrase.split.size] } \
53
- .keep_if{|arr| self.validate(*arr)} \
61
+ .delete_if{|arr| !self.validate(*arr)} \
54
62
  .sort_by{|phrase, occurance, strength| occurance + ((occurance/5.0)*strength) }.reverse
55
63
  end
56
64
 
57
65
  protected
66
+
67
+ # Validates the phrase is within the bounds of our filter
58
68
  def validate(word, occur, strength)
59
69
  occur >= self.filter[:occur] || (occur >= 2 && strength >= self.filter[:strength])
60
70
  end
61
-
71
+
72
+ # Used within #phrases
62
73
  def add(term, norm, multiterm, terms)
63
74
  multiterm << [term, norm]
64
75
  terms[norm] ||= 0
@@ -11,6 +11,7 @@ module Phrasie
11
11
  self.tags_by_term = Hash[file.split("\n").map{|line| line.split.first(2)}]
12
12
  end
13
13
 
14
+ # Takes some input text and outputs an array of the words contained in it.
14
15
  def tokenize(text)
15
16
  terms = []
16
17
  text.split(/\s/).each do |term|
@@ -29,6 +30,8 @@ module Phrasie
29
30
  return terms
30
31
  end
31
32
 
33
+ # Takes an array from #tokenize, or a string which it pipes through #tokenize,
34
+ # and returns the words with part-of-speech tags.
32
35
  def tag(input)
33
36
  if input.is_a? String
34
37
  terms = self.tokenize(input)
@@ -43,7 +46,8 @@ module Phrasie
43
46
  tag = self.tags_by_term[term] || "NND"
44
47
  tagged_terms << [term, tag, term]
45
48
  end
46
-
49
+
50
+ # These rules are defined in rules.rb
47
51
  rules = [
48
52
  'correctDefaultNounTag',
49
53
  'verifyProperNounAtSentenceStart',
@@ -56,7 +60,9 @@ module Phrasie
56
60
  id, tagged_terms[id], tagged_terms = self.send(rule.to_sym, id, tagged_term, tagged_terms)
57
61
  end
58
62
  end
63
+
59
64
  return tagged_terms
60
65
  end
66
+
61
67
  end
62
68
  end
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = "phrasie"
3
- s.version = '0.1.3'
3
+ s.version = '0.1.4'
4
4
  s.authors = ["Ashley Williams"]
5
5
  s.email = ["hi@ashleyw.co.uk"]
6
6
  s.summary = "Determines important terms within a given piece of content."
@@ -64,31 +64,6 @@ class TestPhrasie < Test::Unit::TestCase
64
64
  assert_equal 7, @extractor.phrases(text).size
65
65
  end
66
66
 
67
- # [["Jerusalem", 8, 1],
68
- # ["event", 6, 1],
69
- # ["Palestinian", 6, 1],
70
- # ["East Jerusalem", 4, 2],
71
- # ["East", 4, 1],
72
- # ["police", 4, 1],
73
- # ["Israel", 4, 1],
74
- # ["theatre", 3, 1],
75
- # ["Palestinian theatre", 2, 2],
76
- # ["Palestinian Authority", 2, 2],
77
- # ["opening event", 1, 2],
78
- # ["Israeli authorities", 1, 2],
79
- # ["Richard Makepeace", 1, 2],
80
- # ["court order", 1, 2],
81
- # ["literature festival", 1, 2],
82
- # ["British consul-general", 1, 2],
83
- # ["police notice", 1, 2],
84
- # ["security minister", 1, 2],
85
- # ["Israeli police", 1, 2],
86
- # ["peace accords", 1, 2],
87
- # ["Mr Makepeace", 1, 2],
88
- # ["British Council", 1, 2],
89
- # ["Palestinian state", 1, 2],
90
- # ["Palestinians hope", 1, 2]]
91
-
92
67
  def test_long_text
93
68
  assert_equal 10, @extractor.phrases(@long_text).size
94
69
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 3
9
- version: 0.1.3
8
+ - 4
9
+ version: 0.1.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Ashley Williams