clause_extractor 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/clause_extractor.rb +25 -100
- metadata +2 -2
data/lib/clause_extractor.rb
CHANGED
@@ -1,89 +1,8 @@
|
|
1
1
|
class ClauseExtractor
|
2
2
|
require "conjugations"
|
3
|
-
|
4
|
-
pronouns = "(i|you|he|she|it|they|we|there)"
|
5
|
-
present_perfect = "(already|ever|for|just|never|since|yet)"
|
6
|
-
have_has = "(have|has|haven't|hasn't)"
|
7
|
-
was_were = "(were|was|wasn't|weren't)"
|
8
|
-
had = "([a-z]{1,4}'d|had)(n't)*"
|
9
|
-
have_has = "(have|has|haven't|hasn't|havent|hasnt|has not|have not)"
|
10
|
-
contractions = "(it'*s|he'*s|she'*s|[a-z]{1,4}'*ve)"
|
11
|
-
to_be = "(am|are|'m|'re|'s|is|[a-z]{1,4}'re)"
|
12
|
-
will = "(will|[a-z]{1,4}'ll)"
|
13
|
-
would = "(would|[a-z]{1,4}'d)"
|
14
|
-
|
15
|
-
@tense_regexes = {
|
16
|
-
|
17
|
-
'third' => {
|
18
|
-
"simple present" => [
|
19
|
-
/\b(he|she|it)\s+search(s)?\b/i, #he arrives
|
20
|
-
/\bsearch(s)?\s+(it|them|him|her|me|you|us)\b/i #adapts it
|
21
|
-
]
|
22
|
-
},
|
23
|
-
'infinitive' => {
|
24
|
-
"simple present" => [/\b((I|you|they|we|to)\s+)*+search\b/i],#to arrive
|
25
|
-
|
26
|
-
|
27
|
-
"subjunctive future" => [
|
28
|
-
/\bif\s+#{pronouns}\s+#{was_were}\s+(not\s+)*to\s+(not\s+)*search/i, #if I were to arise
|
29
|
-
/\bif\s+#{pronouns}\s+should(n't)*\s+(not\s+)*search/i #If I should arise
|
30
|
-
],
|
31
|
-
"subjunctive present" => [ /\bthat\s+#{pronouns}\s+(not\s+)*search/i], #that we arrive
|
32
|
-
|
33
|
-
"conditional simple" => [ /\b(#{pronouns}\s+)*(would(n't)*|[a-z]{1,4}'d)(\s+not)*\s+search/i], #I would arise, I wouldn't arise
|
34
|
-
|
35
|
-
"will-future" => [ /\b(#{pronouns}\s+)*(will|[a-z]{1,4}'ll)(\s+not)*\s+search/i], #I'll arise
|
36
|
-
|
37
|
-
"going to-future" => [ /\b(#{pronouns}\s+)*#{to_be}\s+(not\s+)*going\s+to\s+search/i], #they are going to cry
|
38
|
-
},
|
39
|
-
'gerund' => {
|
40
|
-
"conditional perfect progressive" => [ /\b(#{pronouns}\s+)*would\s+(not\s+)*have\s+(not\s+)*been\s+search/i], #I would have been searching
|
41
|
-
"present perfect progressive" => [
|
42
|
-
/\b(#{have_has}\s+)(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*been\s+search/i, #have they not been searching
|
43
|
-
/\b(#{pronouns}\s+)*#{have_has}*\s+(not\s+)*(#{present_perfect}\s+)*been\s+search/i #I have been searching
|
44
|
-
],
|
45
|
-
"past perfect progressive" => [
|
46
|
-
/\b(#{pronouns}\s+)*#{had}\s(not\s+)*(#{present_perfect}\s+)*been\s+search/i, #I had been searching,
|
47
|
-
/\bhad(n't)*\s+(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*been\s+search/i, #had he not been searching
|
48
|
-
|
49
|
-
],
|
50
|
-
|
51
|
-
"conditional progressive" => [/\b(#{pronouns}\s+)*#{would}\s+(not\s+)*be\s+search/i], #I would be searching (I'd)
|
52
|
-
"future progressive" => [
|
53
|
-
/\b((#{pronouns})\s+)*#{will}\s+(not\s+)*be\s+search/i,
|
54
|
-
/\bwill\s+(#{pronouns}\s+)(not\s+)*be\s+search/i,
|
55
|
-
], #I will be searching
|
56
|
-
"past progressive" => [/\b(#{pronouns}\s+)*#{was_were}*\s+(not\s+)*search/i], #I was searching
|
57
|
-
|
58
|
-
"present progressive" => [/\b(#{pronouns}\s*)*(#{to_be}\s+)*(not\s+)*search/i], #I'm rising
|
59
|
-
},
|
60
|
-
"past-participle" => {
|
61
|
-
"conditional perfect" => [/\b(#{pronouns}\s+)*#{would}\s+(not\s+)*have\s+(not\s+)*search/i], #I would not search
|
62
|
-
"future perfect" => [/\b(#{pronouns}\s+)*#{will}\s+have\s+search/i], #I'll have arisen
|
63
|
-
"past perfect" => [
|
64
|
-
/\b(#{pronouns}\s+)*#{had}\s+(not\s+)*((#{present_perfect})\s+)*search/i, #I had arisen
|
65
|
-
/\b#{had}\s+(#{pronouns}\s+)*(not\s+)*((#{present_perfect})\s+)*search/i
|
66
|
-
],
|
67
|
-
"present perfect" => [
|
68
|
-
/\b(#{pronouns}\s+)*#{have_has}\s+((#{present_perfect})\s+)*search/, #They have already seen
|
69
|
-
/\b#{have_has}\s+(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*search/ #Have they already seen
|
70
|
-
],
|
71
|
-
"subjunctive past" => [/\bif\s+#{pronouns}\s+search/i], #if I arose
|
72
|
-
"simple past" => [/\b#{pronouns}\s+search/i] #you chose
|
73
|
-
},
|
74
|
-
#"present perfect" => [/^\s*search\b/i], #arisen
|
75
|
-
#"simple past" => [/^\s*search\b/i] #arose
|
76
|
-
}
|
77
|
-
def self.get_match_start_index(verb, match, index)
|
78
|
-
#get start position of last occurrence of verb in match
|
79
|
-
verb_index_in_match = match.index /#{verb}(?!.*#{verb})/i
|
80
|
-
#count spaces between match start and verb_index_in_match and subtract that from index
|
81
|
-
lo = index - match[0,verb_index_in_match].split(/\s+/).size
|
82
|
-
hi = lo + match[0,verb_index_in_match].split(/\s+/).size
|
83
|
-
return lo, hi
|
84
|
-
end
|
3
|
+
require "matchers"
|
85
4
|
|
86
|
-
def self.get_clauses(phrase, format = String.new
|
5
|
+
def self.get_clauses(phrase, format = String.new)
|
87
6
|
@format = format
|
88
7
|
phrase = phrase.downcase
|
89
8
|
#list = format.match("audioverb") ? Hash.new : Array.new
|
@@ -94,32 +13,38 @@ class ClauseExtractor
|
|
94
13
|
@tense_id ||= get_tenses
|
95
14
|
@con_id ||= get_con_id
|
96
15
|
ranges = []
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
@
|
103
|
-
|
16
|
+
|
17
|
+
phrase.gsub!(/[!.?\(\)]/,"") if phrase
|
18
|
+
phrase_a = phrase.split(/\s+/)
|
19
|
+
phrase_a.length.times do |i|
|
20
|
+
# phrase_a[i].gsub!(/[!.?\(\)]/,"") if phrase_a[i] #remove any punctuation from the word
|
21
|
+
if @con_id[phrase_a[i]] then #if word matches a conjugation
|
22
|
+
$tense_regexes.each do |k,v|
|
23
|
+
if k.match(/#{@id_tiempo[@tiempos[phrase_a[i]]]}/)
|
104
24
|
v.each do |tense, regex_array|
|
105
25
|
regex_array.each do |regex|
|
106
|
-
regex = regex.to_s.gsub("search", "#{
|
107
|
-
phrase, list, ranges = scan_phrase(phrase, list, regex,
|
26
|
+
regex = regex.to_s.gsub("search", "#{phrase_a[i]}")
|
27
|
+
phrase, list, ranges = scan_phrase(phrase, list, regex, phrase_a[i], tense, i, ranges)
|
108
28
|
end
|
109
29
|
end
|
110
30
|
end
|
111
31
|
end
|
112
32
|
end
|
113
|
-
end
|
114
|
-
list.each
|
115
|
-
|
116
|
-
end
|
117
|
-
list.each do |k,v|
|
118
|
-
print "#{k}\n"
|
119
|
-
end
|
33
|
+
end
|
34
|
+
list.each { |k, v| list.delete(k) unless ranges.include?(v) }
|
35
|
+
list.each { |k, v| print "#{k}\n" }
|
120
36
|
list
|
121
37
|
end
|
122
38
|
|
39
|
+
# Computes the word-index range that a matched clause covers, ending at the verb.
#
# verb  - the conjugated verb to locate inside +match+ (converted with to_s).
# match - the matched clause text (String).
# index - Integer position of the verb in the phrase's word list.
#
# Returns two Integers, lo and hi. lo is +index+ minus the number of
# whitespace-separated words that precede the last occurrence of +verb+ in
# +match+; hi is lo plus that same count, i.e. +index+. When +verb+ does not
# occur in +match+ at all, no words precede it and both values equal +index+.
def self.get_match_start_index(verb, match, index)
  # Escape the verb so regex metacharacters in it are matched literally
  # instead of altering the pattern.
  escaped_verb = Regexp.escape(verb.to_s)

  # Start position of the last occurrence of the verb: a hit that is not
  # followed by another hit later on the same line. Falls back to 0 when the
  # verb is absent so the slice below never receives nil.
  verb_index_in_match = match.index(/#{escaped_verb}(?!.*#{escaped_verb})/i) || 0

  # Number of words between the start of the match and the verb.
  words_before_verb = match[0, verb_index_in_match].split(/\s+/).size

  lo = index - words_before_verb
  hi = lo + words_before_verb
  return lo, hi
end
|
47
|
+
|
123
48
|
def self.scan_phrase(phrase, list, regex, verb, tense_label, index, ranges)
|
124
49
|
if match = phrase.match(/#{regex}/i)
|
125
50
|
match = match.to_s
|
@@ -128,7 +53,7 @@ class ClauseExtractor
|
|
128
53
|
if @format.match(/audioverb/)
|
129
54
|
list[@tense_id["#{tense_label}"].to_s+":" + match.to_s + ":" + @verbs[verb].to_s] = (lo..hi)
|
130
55
|
else
|
131
|
-
list["#{tense_label}:" + match.to_s + ":" + (lo..hi).to_s] = (lo..hi)
|
56
|
+
list["#{tense_label}:" + match.to_s + ":" + (lo..hi).to_s] = (lo..hi)
|
132
57
|
end
|
133
58
|
end
|
134
59
|
return phrase, list, ranges
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: clause_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-06 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: English verbal clause extractor
|
15
15
|
email: mikefabrikant@gmail.com
|