term-extract 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/term-extract.rb +27 -14
- data/term-extract.gemspec +2 -2
- metadata +5 -5
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.3.0
|
data/lib/term-extract.rb
CHANGED
@@ -32,22 +32,13 @@ class TermExtract
|
|
32
32
|
|
33
33
|
def extract(content)
|
34
34
|
|
35
|
-
tagger = @@TAGGER.nil? ? Brill::Tagger.new : @@TAGGER
|
36
|
-
|
37
35
|
# Tidy content punctuation
|
38
36
|
# Add a space after periods
|
39
37
|
content.gsub!(/([A-Za-z0-9])\./, '\1. ')
|
40
|
-
|
41
|
-
tags = []
|
42
|
-
tagger.tag(content).each do |tag|
|
43
|
-
|
44
|
-
tag[0].chop!
|
45
|
-
tags.push tag
|
46
|
-
tags.push ['.', '.']
|
47
|
-
else
|
48
|
-
tags.push tag
|
49
|
-
end
|
50
|
-
end
|
38
|
+
|
39
|
+
# Assign POS tags and tidy tag stack
|
40
|
+
tagger = @@TAGGER.nil? ? Brill::Tagger.new : @@TAGGER
|
41
|
+
tags = preprocess_tags(tagger.tag(content))
|
51
42
|
|
52
43
|
# Set pos tags that identify nouns
|
53
44
|
pos = "^NN"
|
@@ -79,7 +70,6 @@ class TermExtract
|
|
79
70
|
multiterm << [term,tag]
|
80
71
|
elsif state == @@NOUN and last_tag =~ /^(NNP|NNPS)$/ and tag == 'IN' and term =~ /(of|for|on|of\sthe|\&|d\'|du|de)/i
|
81
72
|
# Allow preposition : "Secretary of State"
|
82
|
-
# Doesn't support "Chair of the Parades Commission"
|
83
73
|
# Only use when in NNP mode
|
84
74
|
multiterm << [term,tag]
|
85
75
|
elsif state == @@NOUN and tag =~ /#{pos}/
|
@@ -115,6 +105,29 @@ class TermExtract
|
|
115
105
|
end
|
116
106
|
|
117
107
|
protected
|
108
|
+
def preprocess_tags(pos)
|
109
|
+
# Add in full stops to tag list to allow multiterms to work
|
110
|
+
tags = []
|
111
|
+
pos.each do |tag|
|
112
|
+
if tag[0] =~ /\.$/
|
113
|
+
tag[0].chop!
|
114
|
+
tags.push tag
|
115
|
+
tags.push ['.', '.']
|
116
|
+
else
|
117
|
+
tags.push tag
|
118
|
+
end
|
119
|
+
end
|
120
|
+
# Join certain prepositions together to allow them to be extracted
|
121
|
+
# e.g. allows 'News of the World' to be extracted
|
122
|
+
tags.each_with_index do |tag, index|
|
123
|
+
if tag[0] == 'of' && (index + 1) < tags.length && tags[index+1][0] == 'the'
|
124
|
+
tags[index][0] = 'of the'
|
125
|
+
tags.delete_at(index+1)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
tags
|
129
|
+
end
|
130
|
+
|
118
131
|
def add_term(term, tag, multiterm, terms)
|
119
132
|
multiterm << ([term, tag])
|
120
133
|
increment_term(term, tag, terms)
|
data/term-extract.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{term-extract}
|
8
|
-
s.version = "0.2.1"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["rattle"]
|
12
|
-
s.date = %q{2010-12-
|
12
|
+
s.date = %q{2010-12-24}
|
13
13
|
s.email = %q{robl@rjlee.net}
|
14
14
|
s.extra_rdoc_files = [
|
15
15
|
"LICENSE.txt",
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: term-extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
- 1
|
10
|
-
version: 0.2.1
|
8
|
+
- 3
|
9
|
+
- 0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- rattle
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-12-
|
18
|
+
date: 2010-12-24 00:00:00 +00:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|