term-extract 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/term-extract.rb +27 -14
- data/term-extract.gemspec +2 -2
- metadata +5 -5
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.3.0
|
data/lib/term-extract.rb
CHANGED
@@ -32,22 +32,13 @@ class TermExtract
|
|
32
32
|
|
33
33
|
def extract(content)
|
34
34
|
|
35
|
-
tagger = @@TAGGER.nil? ? Brill::Tagger.new : @@TAGGER
|
36
|
-
|
37
35
|
# Tidy content punctuation
|
38
36
|
# Add a space after periods
|
39
37
|
content.gsub!(/([A-Za-z0-9])\./, '\1. ')
|
40
|
-
|
41
|
-
tags
|
42
|
-
tagger.
|
43
|
-
|
44
|
-
tag[0].chop!
|
45
|
-
tags.push tag
|
46
|
-
tags.push ['.', '.']
|
47
|
-
else
|
48
|
-
tags.push tag
|
49
|
-
end
|
50
|
-
end
|
38
|
+
|
39
|
+
# Assign POS tags and tidy tag stack
|
40
|
+
tagger = @@TAGGER.nil? ? Brill::Tagger.new : @@TAGGER
|
41
|
+
tags = preprocess_tags(tagger.tag(content))
|
51
42
|
|
52
43
|
# Set pos tags that identify nouns
|
53
44
|
pos = "^NN"
|
@@ -79,7 +70,6 @@ class TermExtract
|
|
79
70
|
multiterm << [term,tag]
|
80
71
|
elsif state == @@NOUN and last_tag =~ /^(NNP|NNPS)$/ and tag == 'IN' and term =~ /(of|for|on|of\sthe|\&|d\'|du|de)/i
|
81
72
|
# Allow preposition : "Secretary of State"
|
82
|
-
# Doesn't support "Chair of the Parades Commission"
|
83
73
|
# Only use when in NNP mode
|
84
74
|
multiterm << [term,tag]
|
85
75
|
elsif state == @@NOUN and tag =~ /#{pos}/
|
@@ -115,6 +105,29 @@ class TermExtract
|
|
115
105
|
end
|
116
106
|
|
117
107
|
protected
|
108
|
+
def preprocess_tags(pos)
|
109
|
+
# Add in full stops to tag list to allow multiterms to work
|
110
|
+
tags = []
|
111
|
+
pos.each do |tag|
|
112
|
+
if tag[0] =~ /\.$/
|
113
|
+
tag[0].chop!
|
114
|
+
tags.push tag
|
115
|
+
tags.push ['.', '.']
|
116
|
+
else
|
117
|
+
tags.push tag
|
118
|
+
end
|
119
|
+
end
|
120
|
+
# Join certain prepositions together to allow them to be extracted
|
121
|
+
# e.g. allows 'News of the World' to be extracted
|
122
|
+
tags.each_with_index do |tag, index|
|
123
|
+
if tag[0] == 'of' && (index + 1) < tags.length && tags[index+1][0] == 'the'
|
124
|
+
tags[index][0] = 'of the'
|
125
|
+
tags.delete_at(index+1)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
tags
|
129
|
+
end
|
130
|
+
|
118
131
|
def add_term(term, tag, multiterm, terms)
|
119
132
|
multiterm << ([term, tag])
|
120
133
|
increment_term(term, tag, terms)
|
data/term-extract.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{term-extract}
|
8
|
-
s.version = "0.2.1"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["rattle"]
|
12
|
-
s.date = %q{2010-12-
|
12
|
+
s.date = %q{2010-12-24}
|
13
13
|
s.email = %q{robl@rjlee.net}
|
14
14
|
s.extra_rdoc_files = [
|
15
15
|
"LICENSE.txt",
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: term-extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
- 1
|
10
|
-
version: 0.2.1
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- rattle
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
18
|
+
date: 2010-12-24 00:00:00 +00:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|