clause_extractor 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/clause_extractor.rb +25 -100
- metadata +2 -2
data/lib/clause_extractor.rb
CHANGED
@@ -1,89 +1,8 @@
|
|
1
1
|
class ClauseExtractor
|
2
2
|
require "conjugations"
|
3
|
-
|
4
|
-
pronouns = "(i|you|he|she|it|they|we|there)"
|
5
|
-
present_perfect = "(already|ever|for|just|never|since|yet)"
|
6
|
-
have_has = "(have|has|haven't|hasn't)"
|
7
|
-
was_were = "(were|was|wasn't|weren't)"
|
8
|
-
had = "([a-z]{1,4}'d|had)(n't)*"
|
9
|
-
have_has = "(have|has|haven't|hasn't|havent|hasnt|has not|have not)"
|
10
|
-
contractions = "(it'*s|he'*s|she'*s|[a-z]{1,4}'*ve)"
|
11
|
-
to_be = "(am|are|'m|'re|'s|is|[a-z]{1,4}'re)"
|
12
|
-
will = "(will|[a-z]{1,4}'ll)"
|
13
|
-
would = "(would|[a-z]{1,4}'d)"
|
14
|
-
|
15
|
-
@tense_regexes = {
|
16
|
-
|
17
|
-
'third' => {
|
18
|
-
"simple present" => [
|
19
|
-
/\b(he|she|it)\s+search(s)?\b/i, #he arrives
|
20
|
-
/\bsearch(s)?\s+(it|them|him|her|me|you|us)\b/i #adapts it
|
21
|
-
]
|
22
|
-
},
|
23
|
-
'infinitive' => {
|
24
|
-
"simple present" => [/\b((I|you|they|we|to)\s+)*+search\b/i],#to arrive
|
25
|
-
|
26
|
-
|
27
|
-
"subjunctive future" => [
|
28
|
-
/\bif\s+#{pronouns}\s+#{was_were}\s+(not\s+)*to\s+(not\s+)*search/i, #if I were to arise
|
29
|
-
/\bif\s+#{pronouns}\s+should(n't)*\s+(not\s+)*search/i #If I should arise
|
30
|
-
],
|
31
|
-
"subjunctive present" => [ /\bthat\s+#{pronouns}\s+(not\s+)*search/i], #that we arrive
|
32
|
-
|
33
|
-
"conditional simple" => [ /\b(#{pronouns}\s+)*(would(n't)*|[a-z]{1,4}'d)(\s+not)*\s+search/i], #I would arise, I wouldn't arise
|
34
|
-
|
35
|
-
"will-future" => [ /\b(#{pronouns}\s+)*(will|[a-z]{1,4}'ll)(\s+not)*\s+search/i], #I'll arise
|
36
|
-
|
37
|
-
"going to-future" => [ /\b(#{pronouns}\s+)*#{to_be}\s+(not\s+)*going\s+to\s+search/i], #they are going to cry
|
38
|
-
},
|
39
|
-
'gerund' => {
|
40
|
-
"conditional perfect progressive" => [ /\b(#{pronouns}\s+)*would\s+(not\s+)*have\s+(not\s+)*been\s+search/i], #I would have been searching
|
41
|
-
"present perfect progressive" => [
|
42
|
-
/\b(#{have_has}\s+)(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*been\s+search/i, #have they not been searching
|
43
|
-
/\b(#{pronouns}\s+)*#{have_has}*\s+(not\s+)*(#{present_perfect}\s+)*been\s+search/i #I have been searching
|
44
|
-
],
|
45
|
-
"past perfect progressive" => [
|
46
|
-
/\b(#{pronouns}\s+)*#{had}\s(not\s+)*(#{present_perfect}\s+)*been\s+search/i, #I had been searching,
|
47
|
-
/\bhad(n't)*\s+(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*been\s+search/i, #had he not been searching
|
48
|
-
|
49
|
-
],
|
50
|
-
|
51
|
-
"conditional progressive" => [/\b(#{pronouns}\s+)*#{would}\s+(not\s+)*be\s+search/i], #I would be searching (I'd)
|
52
|
-
"future progressive" => [
|
53
|
-
/\b((#{pronouns})\s+)*#{will}\s+(not\s+)*be\s+search/i,
|
54
|
-
/\bwill\s+(#{pronouns}\s+)(not\s+)*be\s+search/i,
|
55
|
-
], #I will be searching
|
56
|
-
"past progressive" => [/\b(#{pronouns}\s+)*#{was_were}*\s+(not\s+)*search/i], #I was searching
|
57
|
-
|
58
|
-
"present progressive" => [/\b(#{pronouns}\s*)*(#{to_be}\s+)*(not\s+)*search/i], #I'm rising
|
59
|
-
},
|
60
|
-
"past-participle" => {
|
61
|
-
"conditional perfect" => [/\b(#{pronouns}\s+)*#{would}\s+(not\s+)*have\s+(not\s+)*search/i], #I would not search
|
62
|
-
"future perfect" => [/\b(#{pronouns}\s+)*#{will}\s+have\s+search/i], #I'll have arisen
|
63
|
-
"past perfect" => [
|
64
|
-
/\b(#{pronouns}\s+)*#{had}\s+(not\s+)*((#{present_perfect})\s+)*search/i, #I had arisen
|
65
|
-
/\b#{had}\s+(#{pronouns}\s+)*(not\s+)*((#{present_perfect})\s+)*search/i
|
66
|
-
],
|
67
|
-
"present perfect" => [
|
68
|
-
/\b(#{pronouns}\s+)*#{have_has}\s+((#{present_perfect})\s+)*search/, #They have already seen
|
69
|
-
/\b#{have_has}\s+(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*search/ #Have they already seen
|
70
|
-
],
|
71
|
-
"subjunctive past" => [/\bif\s+#{pronouns}\s+search/i], #if I arose
|
72
|
-
"simple past" => [/\b#{pronouns}\s+search/i] #you chose
|
73
|
-
},
|
74
|
-
#"present perfect" => [/^\s*search\b/i], #arisen
|
75
|
-
#"simple past" => [/^\s*search\b/i] #arose
|
76
|
-
}
|
77
|
-
def self.get_match_start_index(verb, match, index)
|
78
|
-
#get start position of last occurrence of verb in match
|
79
|
-
verb_index_in_match = match.index /#{verb}(?!.*#{verb})/i
|
80
|
-
#count spaces between match start and verb_index_in_match and subtract that from index
|
81
|
-
lo = index - match[0,verb_index_in_match].split(/\s+/).size
|
82
|
-
hi = lo + match[0,verb_index_in_match].split(/\s+/).size
|
83
|
-
return lo, hi
|
84
|
-
end
|
3
|
+
require "matchers"
|
85
4
|
|
86
|
-
def self.get_clauses(phrase, format = String.new
|
5
|
+
def self.get_clauses(phrase, format = String.new)
|
87
6
|
@format = format
|
88
7
|
phrase = phrase.downcase
|
89
8
|
#list = format.match("audioverb") ? Hash.new : Array.new
|
@@ -94,32 +13,38 @@ class ClauseExtractor
|
|
94
13
|
@tense_id ||= get_tenses
|
95
14
|
@con_id ||= get_con_id
|
96
15
|
ranges = []
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
@
|
103
|
-
|
16
|
+
|
17
|
+
phrase.gsub!(/[!.?\(\)]/,"") if phrase
|
18
|
+
phrase_a = phrase.split(/\s+/)
|
19
|
+
phrase_a.length.times do |i|
|
20
|
+
# phrase_a[i].gsub!(/[!.?\(\)]/,"") if phrase_a[i] #remove any punctuation from the word
|
21
|
+
if @con_id[phrase_a[i]] then #if word matches a conjugation
|
22
|
+
$tense_regexes.each do |k,v|
|
23
|
+
if k.match(/#{@id_tiempo[@tiempos[phrase_a[i]]]}/)
|
104
24
|
v.each do |tense, regex_array|
|
105
25
|
regex_array.each do |regex|
|
106
|
-
regex = regex.to_s.gsub("search", "#{
|
107
|
-
phrase, list, ranges = scan_phrase(phrase, list, regex,
|
26
|
+
regex = regex.to_s.gsub("search", "#{phrase_a[i]}")
|
27
|
+
phrase, list, ranges = scan_phrase(phrase, list, regex, phrase_a[i], tense, i, ranges)
|
108
28
|
end
|
109
29
|
end
|
110
30
|
end
|
111
31
|
end
|
112
32
|
end
|
113
|
-
end
|
114
|
-
list.each
|
115
|
-
|
116
|
-
end
|
117
|
-
list.each do |k,v|
|
118
|
-
print "#{k}\n"
|
119
|
-
end
|
33
|
+
end
|
34
|
+
list.each { |k, v| list.delete(k) unless ranges.include?(v) }
|
35
|
+
list.each { |k, v| print "#{k}\n" }
|
120
36
|
list
|
121
37
|
end
|
122
38
|
|
39
|
+
def self.get_match_start_index(verb, match, index)
|
40
|
+
#get start position of last occurrence of verb in match
|
41
|
+
verb_index_in_match = match.index /#{verb}(?!.*#{verb})/i
|
42
|
+
#count spaces between match start and verb_index_in_match and subtract that from index
|
43
|
+
lo = index - match[0,verb_index_in_match].split(/\s+/).size
|
44
|
+
hi = lo + match[0,verb_index_in_match].split(/\s+/).size
|
45
|
+
return lo, hi
|
46
|
+
end
|
47
|
+
|
123
48
|
def self.scan_phrase(phrase, list, regex, verb, tense_label, index, ranges)
|
124
49
|
if match = phrase.match(/#{regex}/i)
|
125
50
|
match = match.to_s
|
@@ -128,7 +53,7 @@ class ClauseExtractor
|
|
128
53
|
if @format.match(/audioverb/)
|
129
54
|
list[@tense_id["#{tense_label}"].to_s+":" + match.to_s + ":" + @verbs[verb].to_s] = (lo..hi)
|
130
55
|
else
|
131
|
-
list["#{tense_label}:" + match.to_s + ":" + (lo..hi).to_s] = (lo..hi)
|
56
|
+
list["#{tense_label}:" + match.to_s + ":" + (lo..hi).to_s] = (lo..hi)
|
132
57
|
end
|
133
58
|
end
|
134
59
|
return phrase, list, ranges
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: clause_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-06 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: English verbal clause extractor
|
15
15
|
email: mikefabrikant@gmail.com
|