phrasie 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,68 @@
# States of the small state machine that drives Phrasie::Extractor#phrases.
SEARCH = 0 # looking for the start of a noun phrase
NOUN = 1   # currently collecting consecutive nouns

module Phrasie
  # Finds candidate key phrases in tagged text and ranks them by how often
  # they occur and how many words they contain.
  class Extractor
    attr_accessor :tagger, :filter

    # options[:filter] - Hash with :occur and :strength thresholds used by
    #                    #validate. Defaults to {:strength => 2, :occur => 3}.
    def initialize(options = {})
      self.tagger = Tagger.new
      self.filter = options[:filter] || { :strength => 2, :occur => 3 }
    end

    def to_s
      "#<Phrasie::Extractor>"
    end

    # Extracts phrases from +input+.
    #
    # input     - a String (tagged via #tagger first) or an Array of
    #             [term, tag, norm] triples. Any other type yields [].
    #             An Array argument is only read, never modified.
    # min_occur - currently unused; kept so existing callers keep working.
    #             The occurrence threshold actually applied is filter[:occur].
    #
    # Returns an Array of [phrase, occurrences, word_count] triples that pass
    # #validate, ordered by descending score, where
    # score = occurrences + (occurrences / 5.0) * word_count.
    def phrases(input, min_occur = 3)
      tagged_terms =
        case input
        when String then tagger.tag(input)
        when Array then input
        end
      return [] if tagged_terms.nil?

      terms = {}
      multiterm = []
      state = SEARCH

      # Iterate without shifting so a caller-supplied Array stays intact.
      tagged_terms.each do |term, tag, norm|
        if tag.start_with?('N')
          # A noun either opens a phrase or extends the current one.
          state = NOUN
          add(term, norm, multiterm, terms)
        elsif state == SEARCH && tag == 'JJ' && term[0].upcase == term[0]
          # An adjective may open a phrase when its first character is
          # unchanged by upcasing (e.g. "British").
          state = NOUN
          add(term, norm, multiterm, terms)
        elsif state == NOUN
          # Any non-noun token closes the phrase being collected.
          state = SEARCH
          record_phrase(multiterm, terms)
          multiterm = []
        end
      end

      # A phrase still open when the input ends must be counted as well.
      record_phrase(multiterm, terms) if state == NOUN

      terms
        .map { |phrase, occurrence| [phrase, occurrence, phrase.split.size] }
        .select { |candidate| validate(*candidate) }
        .sort_by { |_phrase, occurrence, strength| occurrence + ((occurrence / 5.0) * strength) }
        .reverse
    end

    protected

    # A candidate is kept when it occurs at least filter[:occur] times, or
    # at least twice with a word count of at least filter[:strength].
    def validate(word, occur, strength)
      occur >= filter[:occur] || (occur >= 2 && strength >= filter[:strength])
    end

    # Appends the term to the phrase being collected and counts its
    # normalised form as a single-word candidate.
    def add(term, norm, multiterm, terms)
      multiterm << [term, norm]
      terms[norm] ||= 0
      terms[norm] += 1
    end

    # Counts the collected multi-word phrase (joined surface forms).
    # Single-word runs are skipped because #add already counted them.
    def record_phrase(multiterm, terms)
      return unless multiterm.size > 1

      phrase = multiterm.map(&:first).join(' ')
      terms[phrase] ||= 0
      terms[phrase] += 1
    end
  end
end
@@ -0,0 +1,82 @@
module Phrasie
  # Correction rules applied to each tagged term after the lexicon lookup.
  #
  # Every rule takes (id, tagged_term, tagged_terms), where tagged_term is a
  # mutable [term, tag, norm] Array located at tagged_terms[id], edits the
  # arrays in place, and returns [id, tagged_term, tagged_terms].
  #
  # The including object must respond to #tags_by_term with a Hash mapping a
  # term to its tag.
  module Rules
    # Determine whether a default noun is plural or singular.
    # Terms tagged 'NND' become 'NNS' (with the trailing "s" dropped from the
    # normalised form) when they end in "s", and 'NN' otherwise.
    def correctDefaultNounTag(id, tagged_term, tagged_terms)
      term, tag, _norm = tagged_term
      if tag == 'NND'
        if term.end_with?('s')
          tagged_term[1] = 'NNS'
          tagged_term[2] = term[0..-2]
        else
          tagged_term[1] = 'NN'
        end
      end
      [id, tagged_term, tagged_terms]
    end

    # Verify that noun at sentence start is truly proper.
    # When the lower-cased term is a common noun in the lexicon, the term,
    # its tag and its normalised form are replaced by the lower-case entry.
    def verifyProperNounAtSentenceStart(id, tagged_term, tagged_terms)
      term, tag, _norm = tagged_term
      at_sentence_start = id == 0 || tagged_terms[id - 1][1] == '.'
      if %w[NNP NNPS].include?(tag) && at_sentence_start
        lower_term = term.downcase
        lower_tag = tags_by_term[lower_term]
        if %w[NN NNS].include?(lower_tag)
          tagged_term[0] = tagged_term[2] = lower_term
          tagged_term[1] = lower_tag
        end
      end
      [id, tagged_term, tagged_terms]
    end

    # Determine the verb after a modal verb to avoid accidental noun detection.
    # Adverbs ('RB') directly after the modal are skipped; if the first other
    # term is tagged 'NN' it is retagged 'VB'.
    def determineVerbAfterModal(id, tagged_term, tagged_terms)
      _term, tag, _norm = tagged_term
      return [id, tagged_term, tagged_terms] if tag != 'MD'

      following = tagged_terms[(id + 1)..-1] || []
      candidate = following.find { |other| other[1] != 'RB' }
      candidate[1] = 'VB' if candidate && candidate[1] == 'NN'

      [id, tagged_term, tagged_terms]
    end

    # Set the normalised form of a plural noun to its singular when that
    # singular has a tag in the lexicon. Only terms whose normalised form
    # still equals the term itself are considered.
    def normalizePluralForms(id, tagged_term, tagged_terms)
      term, tag, norm = tagged_term
      if %w[NNS NNPS].include?(tag) && term == norm
        # Tried in order: "s" -> "", "es" -> "", "ies" -> "y".
        [['s', ''], ['es', ''], ['ies', 'y']].each do |suffix, replacement|
          # The term must really end in the suffix before it is stripped.
          next unless term.end_with?(suffix)

          singular = term[0, term.length - suffix.length] + replacement
          next if tags_by_term[singular].nil?

          tagged_term[2] = singular
          break
        end
      end
      [id, tagged_term, tagged_terms]
    end
  end
end
@@ -0,0 +1,62 @@
module Phrasie
  # Splits text into terms and assigns each a part-of-speech tag from a
  # lexicon file, then applies the correction rules from Phrasie::Rules.
  class Tagger
    include Phrasie::Rules

    # Splits a whitespace-delimited token into three captures: leading
    # non-letters, the word itself (letters, "-" and "." allowed inside,
    # ending in a letter), and whatever follows.
    TERM_SPEC = /([^a-zA-Z]*)([a-zA-Z\-\.]*[a-zA-Z])([^a-zA-Z]*[a-zA-Z]*)/

    attr_accessor :language, :tags_by_term, :lexicon

    # options[:language] - name used to build the default lexicon file name
    #                      (default 'english').
    # options[:lexicon]  - path of the lexicon file; defaults to
    #                      data/<language>-lexicon.txt next to this file.
    #
    # Each lexicon line supplies a term and a tag as its first two
    # whitespace-separated fields.
    def initialize(options = {})
      self.language = options[:language] || 'english'
      self.lexicon = options[:lexicon] || File.expand_path("#{__FILE__}/../data/#{self.language}-lexicon.txt")

      self.tags_by_term = {}
      File.read(lexicon).each_line do |line|
        term, tag = line.split
        # Blank lines carry no entry; skipping them avoids handing Hash[]
        # an empty pair, which raises ArgumentError.
        next if term.nil?

        tags_by_term[term] = tag
      end
    end

    # Splits text on whitespace, then breaks every token into the non-empty
    # TERM_SPEC captures. Tokens that do not match are kept whole.
    def tokenize(text)
      text.split(/\s/).each_with_object([]) do |term, terms|
        next if term.empty?

        match = TERM_SPEC.match(term)
        if match
          terms.concat(match.captures.reject(&:empty?))
        else
          terms << term
        end
      end
    end

    # Tags a String (tokenized first) or an Array of terms.
    # Returns an Array of [term, tag, norm] triples, or [] for any other
    # input type. Terms missing from the lexicon start out tagged "NND".
    def tag(input)
      terms =
        case input
        when String then tokenize(input)
        when Array then input
        end
      return [] if terms.nil?

      tagged_terms = terms.map { |term| [term, tags_by_term[term] || "NND", term] }

      rules = [
        'correctDefaultNounTag',
        'verifyProperNounAtSentenceStart',
        'determineVerbAfterModal',
        'normalizePluralForms'
      ]

      # The rules edit the triples in place and hand back the same objects,
      # so their return values need not be assigned back.
      tagged_terms.each_with_index do |tagged_term, id|
        rules.each { |rule| send(rule, id, tagged_term, tagged_terms) }
      end
      tagged_terms
    end
  end
end
data/phrasie.gemspec ADDED
@@ -0,0 +1,17 @@
# Gem specification for phrasie.
Gem::Specification.new do |s|
  s.name = "phrasie"
  s.version = '0.1.3'
  s.authors = ["Ashley Williams"]
  s.email = ["hi@ashleyw.co.uk"]
  s.summary = "Determines important terms within a given piece of content."
  s.homepage = "https://github.com/ashleyw/Phrasie"
  s.description = "Determines important terms within a given piece of content. It\n" \
                  "uses linguistic tools such as Parts-Of-Speech (POS) and some simple\n" \
                  "statistical analysis to determine the terms and their strength."

  s.required_rubygems_version = ">= 1.3.6"
  # Guarded so the spec still loads on a RubyGems release without this
  # legacy setter.
  s.rubyforge_project = "phrasie" if s.respond_to?(:rubyforge_project=)

  # Resolve the manifest next to this gemspec rather than relative to the
  # current working directory, and ignore blank lines in it.
  manifest = File.expand_path('../Manifest.txt', __FILE__)
  s.files = File.read(manifest).split("\n").reject { |path| path.strip.empty? }
  s.require_path = 'lib'
end
data/script/console ADDED
@@ -0,0 +1,10 @@
#!/usr/bin/env ruby
# File: script/console
# Starts irb with completion enabled and lib/phrasie.rb preloaded.

# Windows builds of Ruby (mswin / mingw platforms) use the irb.bat launcher.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.dirname(__FILE__) + '/../lib/phrasie.rb'}"
puts "Loading phrasie gem"
exec "#{irb} #{libs} --simple-prompt"
data/script/destroy ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby
# Runs the RubiGen "destroy" script with this project's component sources.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Fall back to loading RubyGems first when rubigen is not directly loadable.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# Drop a leading help flag before the arguments reach RubiGen.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV[0])

RubiGen::Base.use_component_sources!(
  [:rubygems, :newgem, :newgem_theme, :test_unit]
)
RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby
# Runs the RubiGen "generate" script with this project's component sources.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Fall back to loading RubyGems first when rubigen is not directly loadable.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# Drop a leading help flag before the arguments reach RubiGen.
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV[0])

RubiGen::Base.use_component_sources!(
  [:rubygems, :newgem, :newgem_theme, :test_unit]
)
RubiGen::Scripts::Generate.new.run(ARGV)
@@ -0,0 +1,3 @@
1
+ require 'stringio'
2
+ require 'test/unit'
3
+ require File.dirname(__FILE__) + '/../lib/phrasie'
@@ -0,0 +1,104 @@
require File.dirname(__FILE__) + '/test_helper.rb'

# Tests for Phrasie::Extractor#phrases run against short and long news-style
# sample texts.
class TestPhrasie < Test::Unit::TestCase
  def setup
    @text = 'The British consul of Boston resides in Newton. The British consul is awesome.'
    @long_text = %(Police shut Palestinian theatre in Jerusalem.

Israeli police have shut down a Palestinian theatre in East Jerusalem.

The action, on Thursday, prevented the closing event of an international
literature festival from taking place.

Police said they were acting on a court order, issued after intelligence
indicated that the Palestinian Authority was involved in the event.

Israel has occupied East Jerusalem since 1967 and has annexed the
area. This is not recognised by the international community.

The British consul-general in Jerusalem , Richard Makepeace, was
attending the event.

"I think all lovers of literature would regard this as a very
regrettable moment and regrettable decision," he added.

Mr Makepeace said the festival's closing event would be reorganised to
take place at the British Council in Jerusalem.

The Israeli authorities often take action against events in East
Jerusalem they see as connected to the Palestinian Authority.

Saturday's opening event at the same theatre was also shut down.

A police notice said the closure was on the orders of Israel's internal
security minister on the grounds of a breach of interim peace accords
from the 1990s.

These laid the framework for talks on establishing a Palestinian state
alongside Israel, but left the status of Jerusalem to be determined by
further negotiation.

Israel has annexed East Jerusalem and declares it part of its eternal
capital.

Palestinians hope to establish their capital in the area.)
    @extractor = Phrasie::Extractor.new
  end

  def test_extractor
    expected = [["British consul", 2, 2]]
    assert_equal expected, @extractor.phrases(@text).sort_by{|a| a[1]}
  end

  def test_non_words
    text = %(LONDON - WikiLeaks founder Julian Assange was refused bail and jailed for a week by a British court Tuesday, pending an extradition hearing over alleged sex offenses in Sweden.
Assange turned himself in to U.K. police earlier in the day in the latest blow to his WikiLeaks organization, which faces legal, financial and technological challenges after releasing hundreds of secret U.S. diplomatic cables.
Swedish prosecutors had issued an arrest warrant for the 39-year-old Australian, who is accused of sexual misconduct with two women.
Assange surrendered at 9:30 a.m. local time (4:30 a.m. ET) Tuesday. The U.K.'s Guardian newspaper reported that Assange later arrived at a London court accompanied by British lawyers Mark Stephens and Jennifer Robinson.
During his court appearance, Assange said he would fight extradition to Sweden and he provided the court with an Australian address. Britain's Sky News reported that Assange was receiving consular assistance from officials at the Australian High Commission.
The next court hearing is scheduled for next Tuesday, and Assange will remain in custody until then because he was deemed to be a flight risk.
Judge Howard Riddle asked Assange whether he understood that he could agree to be extradited to Sweden. Assange, dressed in a navy blue suit, cleared his throat and said: "I understand that and I do not consent."
The judge said he had grounds to believe that the former computer hacker - a self-described homeless refugee - might not show up to his next hearing if he were granted bail.
Arguments during the hour-long hearing detailed the sex accusations against Assange, all of which he has denied.
Australian journalist John Pilger, British film director Ken Loach and Jemima Khan, former wife of Pakistani cricketer and politician Imran Khan, all offered to put up sureties to persuade the court Assange would not flee.)
    assert_equal 7, @extractor.phrases(text).size
  end

  # Reference listing of [phrase, occurrences, word count] triples.
  # NOTE(review): presumably the unfiltered candidates for @long_text; confirm.
  #
  # [["Jerusalem", 8, 1],
  # ["event", 6, 1],
  # ["Palestinian", 6, 1],
  # ["East Jerusalem", 4, 2],
  # ["East", 4, 1],
  # ["police", 4, 1],
  # ["Israel", 4, 1],
  # ["theatre", 3, 1],
  # ["Palestinian theatre", 2, 2],
  # ["Palestinian Authority", 2, 2],
  # ["opening event", 1, 2],
  # ["Israeli authorities", 1, 2],
  # ["Richard Makepeace", 1, 2],
  # ["court order", 1, 2],
  # ["literature festival", 1, 2],
  # ["British consul-general", 1, 2],
  # ["police notice", 1, 2],
  # ["security minister", 1, 2],
  # ["Israeli police", 1, 2],
  # ["peace accords", 1, 2],
  # ["Mr Makepeace", 1, 2],
  # ["British Council", 1, 2],
  # ["Palestinian state", 1, 2],
  # ["Palestinians hope", 1, 2]]

  def test_long_text
    assert_equal 10, @extractor.phrases(@long_text).size
  end

  def test_filter_options
    @extractor.filter = {:occur => 4, :strength => 3}
    assert_equal 7, @extractor.phrases(@long_text).size
  end

  def test_extractor_to_s
    # assert_equal reports both values on failure, unlike assert with ==.
    assert_equal "#<Phrasie::Extractor>", @extractor.to_s
  end
end
metadata ADDED
@@ -0,0 +1,83 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: phrasie
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 3
9
+ version: 0.1.3
10
+ platform: ruby
11
+ authors:
12
+ - Ashley Williams
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-01-17 00:00:00 +00:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: |-
22
+ Determines important terms within a given piece of content. It
23
+ uses linguistic tools such as Parts-Of-Speech (POS) and some simple
24
+ statistical analysis to determine the terms and their strength.
25
+ email:
26
+ - hi@ashleyw.co.uk
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files: []
32
+
33
+ files:
34
+ - Manifest.txt
35
+ - README.md
36
+ - Rakefile
37
+ - lib/phrasie.rb
38
+ - lib/phrasie/extractor.rb
39
+ - lib/phrasie/rules.rb
40
+ - lib/phrasie/tag.rb
41
+ - lib/phrasie/data/english-lexicon.txt
42
+ - script/console
43
+ - script/destroy
44
+ - script/generate
45
+ - phrasie.gemspec
46
+ - test/test_helper.rb
47
+ - test/test_phrasie.rb
48
+ has_rdoc: true
49
+ homepage: https://github.com/ashleyw/Phrasie
50
+ licenses: []
51
+
52
+ post_install_message:
53
+ rdoc_options: []
54
+
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ segments:
63
+ - 0
64
+ version: "0"
65
+ required_rubygems_version: !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ segments:
71
+ - 1
72
+ - 3
73
+ - 6
74
+ version: 1.3.6
75
+ requirements: []
76
+
77
+ rubyforge_project: phrasie
78
+ rubygems_version: 1.3.7
79
+ signing_key:
80
+ specification_version: 3
81
+ summary: Determines important terms within a given piece of content.
82
+ test_files: []
83
+