clause_extractor 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/clause_extractor.rb +152 -82
- data/lib/conjugations.rb +5 -4
- metadata +2 -2
data/lib/clause_extractor.rb
CHANGED
@@ -1,88 +1,158 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
"
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
1
|
+
class ClauseExtractor
|
2
|
+
require "conjugations"
|
3
|
+
|
4
|
+
# Regex fragments interpolated into the tense patterns below.
pronouns = "(i|you|he|she|it|they|we|there)"
present_perfect = "(already|ever|for|just|never|since|yet)"
have_has = "(have|has|haven't|hasn't)"

# Tense-detection patterns, keyed first by verb form and then by tense label.
# Each label maps to an array of patterns. The literal word "search" is a
# placeholder: get_clauses replaces it with the conjugated word found in the
# phrase before matching. The outer keys are matched against the value looked
# up through @id_tiempo / @tiempos for that word.
@tense_regexes = {

  'third' => {
    "simple present" => [
      /\b(he|she|it)\s+search(s)?\b/i, #he arrives
      /\bsearch(s)?\s+(it|them|him|her|me|you|us)\b/i #adapts it
    ]
  },
  'infinitive' => {
    "simple present" => [
      /\b((I|you|they|we|to)\s+)*+search\b/i, #arrive
    ],

    "subjunctive future" => [
      /\bif\s+#{pronouns}\s+were\s+(not\s+)*to\s+(not\s+)*search/i, #if I were to arise
      /\bif\s+#{pronouns}\s+should\s+(not\s+)*search/i #If I should arise
    ],
    "subjunctive present" => [ /\bthat\s+#{pronouns}\s+(not\s+)*search/i], #that we arrive

    "conditional simple" => [ /\b(#{pronouns}\s+)*(would|[a-z]{1,4}'d)(\s+not)*\s+search/i], #I would arise

    "will-future" => [ /\b(#{pronouns}\s+)*(will|[a-z]{1,4}'ll)(\s+not)*\s+search/i], #I'll arise

    "going to-future" => [ /\b(#{pronouns}\s+)*(am|are|i'm|[a-z]{1,4}'re|[a-z]{1,4}'s)\s+(not\s+)*going\s+to\s+search/i], #they are going to cry
  },
  'gerund' => {
    "conditional perfect progressive" => [/\b(#{pronouns}\s+)*would\s+(not\s+)*have\s+(not\s+)*been\s+search/i], #I would have been searching
    "present perfect progressive" => [/\b(#{pronouns}\s+)*([a-z]{1,4}'ve|have|has)(n't)*\s+(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*been\s+search/i], #I have been searching
    "conditional progressive" => [/\b(#{pronouns}\s+)*(would|[a-z]{1,4}'d)\s+(not\s+)*be\s+search/i], #I would be searching (I'd)
    "future progressive" => [
      /\b((#{pronouns})\s+)*(will|[a-z]{1,4}'ll)\s+(not\s+)*be\s+search/i,
      /\bwill\s+(#{pronouns}\s+)(not\s+)*be\s+search/i,
    ], #I will be searching
    "past progressive" => [/\b(#{pronouns}\s+)*(was|were)(n't)*\s+(not\s+)*search/i], #I was searching

    "present progressive" => [/\b(#{pronouns}\s+)*((am|are|is|i'm|\b[a-z]{1,4}'re|\b[a-z]{1,4}'s)\s+)*(not\s+)*search/i], #I'm rising

  },
  "past-participle" => {
    "conditional perfect" => [/\b(#{pronouns}\s+)*(would|[a-z]{1,4}'d)\s+(not\s+)*have\s+(not\s+)*search/i], #I would have arisen
    "future perfect" => [/\b(#{pronouns}\s+)*(will|[a-z]{1,4}'ll)\s+have\s+search/i], #I'll have arisen
    "past perfect" => [/\b(#{pronouns}\s+)*(had|[a-z]{1,4}'d)\s+(#{pronouns}\s+)*(not\s+)*((#{present_perfect})\s+)*search/i], #I had arisen
    # Case-insensitive like every other pattern; Regexp#to_s embeds the flags,
    # so the /.../i wrapper applied later in scan_phrase would not add it back.
    "present perfect" => [/\b(#{pronouns}\s+)*#{have_has}\s+(#{pronouns}\s+)*(not\s+)*((just|already|ever)\s+)*search/i], #Have you seen
    "subjunctive past" => [/\bif\s+(i|you|he|she|it|they|we)\s+search/i], #if I arose
    "simple past" => [/\b#{pronouns}\s+search/i] #you chose
  },

  # # "present perfect" => [/^\s*search\b/i], #arisen
  # # "simple past" => [/^\s*search\b/i] #arose
}
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
# Converts the verb's word position into the word span covered by a match.
#
# verb  - conjugated verb; interpolated into a regex as-is (not escaped).
# match - text that a tense pattern matched.
# index - word position of the verb, as supplied by the caller.
#
# Returns [lo, hi]. lo is index minus the number of whitespace-separated
# words that precede the verb inside match; hi is lo plus that same count.
# If verb cannot be found in match, returns [index, index] instead of
# raising a TypeError from slicing with nil.
def self.get_match_start_index(verb, match, index)
  # position of the last occurrence of verb in match: the lookahead rejects
  # any occurrence that is followed by another one later on the same line
  verb_index_in_match = match.index(/#{verb}(?!.*#{verb})/i)
  return index, index if verb_index_in_match.nil?

  # number of words between the start of the match and the verb
  words_before_verb = match[0, verb_index_in_match].split(/\s+/).size
  lo = index - words_before_verb
  [lo, lo + words_before_verb]
end
|
79
|
+
|
80
|
+
# Scans a phrase for known conjugations and collects the tense clauses found.
#
# phrase - text to analyse; it is downcased before scanning.
# format - String; scan_phrase checks it against /audioverb/ to choose the
#          key layout of the result.
# verbs, tiempo, id_tiempo, tense_id, con_id - accepted but not read here;
#          the lookup tables are loaded once through get_verbs, get_tiempos,
#          get_id_tiempos, get_tenses and get_con_id (defined outside this
#          block) and memoized in instance variables.
#
# Returns a Hash whose keys are built by scan_phrase and whose values are
# word-index ranges; only entries whose range survived prioritize_ranges
# are kept. The hash is also printed to stdout.
def self.get_clauses(phrase, format = String.new, verbs=nil, tiempo=nil, id_tiempo=nil, tense_id=nil, con_id=nil)
  @format = format
  phrase = phrase.downcase
  list = {}
  @verbs ||= get_verbs
  @tiempos ||= get_tiempos
  @id_tiempo ||= get_id_tiempos
  @tense_id ||= get_tenses
  @con_id ||= get_con_id
  ranges = []

  phrase.split(/\s+/).each_with_index do |word, i|
    word.gsub!(/[!.?\(\)]/, "") # remove any punctuation from the word
    next unless @con_id[word] # only words that are known conjugations

    @tense_regexes.each do |form, tenses|
      # only try the pattern groups whose key matches this word's form
      next unless form.match(/#{@id_tiempo[@tiempos[word]]}/)

      tenses.each do |tense, regex_array|
        regex_array.each do |regex|
          # swap the "search" placeholder for the word actually found
          regex = regex.to_s.gsub("search", "#{word}")
          phrase, list, ranges = scan_phrase(phrase, list, regex, word, tense, i, ranges)
        end
      end
    end
  end

  # drop every entry whose range lost out during prioritisation
  list.delete_if { |_key, range| !ranges.include?(range) }
  # NOTE(review): looks like leftover debug output; kept because callers may
  # rely on it being printed - confirm before removing.
  print "#{list}\n"
  list
end
|
115
|
+
|
116
|
+
# Tries one tense pattern against the phrase and records a hit.
#
# phrase      - text being scanned (returned unchanged).
# list        - Hash of results; a new key => word-range entry is added on a hit.
# regex       - pattern source, wrapped in a case-insensitive regex here.
# verb        - conjugated verb the pattern was built for.
# tense_label - tense name used to build the result key.
# index       - word position of the verb, passed on to get_match_start_index.
# ranges      - ranges accepted so far; updated through prioritize_ranges.
#
# Returns [phrase, list, ranges].
def self.scan_phrase(phrase, list, regex, verb, tense_label, index, ranges)
  found = phrase.match(/#{regex}/i)
  return phrase, list, ranges unless found

  match = found.to_s
  lo, hi = get_match_start_index(verb, match, index)
  ranges = prioritize_ranges(ranges, lo, hi, match)

  if @format.match(/audioverb/)
    # audioverb layout: tense id instead of the tense label
    list["#{@tense_id[tense_label.to_s]}:#{match}:#{@verbs[verb]}"] = (lo..hi)
  else
    list["#{tense_label} :#{match}:#{@verbs[verb]}"] = (lo..hi)
  end

  return phrase, list, ranges
end
|
126
|
+
|
127
|
+
# Merges a new word range (lo..hi) into the list of accepted ranges,
# preferring longer ranges over shorter overlapping ones.
#
# ranges - Array of Range; modified in place and returned.
# lo, hi - bounds of the new range.
# match  - not used here; kept so existing callers keep working.
#
# Rules, applied to each existing range:
# * same start as the new range and shorter -> replaced by the new range;
# * its begin or end lies inside the new range and it is shorter -> removed;
# * otherwise it is kept.
# The new range is then appended unless some remaining range already
# contains lo or hi.
#
# The survivors are collected in a fresh array rather than deleted while
# iterating by index, which shifted elements and ended in a nil dereference.
def self.prioritize_ranges(ranges, lo, hi,match)
  range = (lo..hi)

  survivors = []
  ranges.each do |existing|
    longer = range.count > existing.count
    overlaps = range.include?(existing.begin) || range.include?(existing.end)
    if existing.begin == lo && longer
      survivors << range
    elsif !(longer && overlaps)
      survivors << existing
    end
  end
  ranges.replace(survivors)

  # add the new range only if no existing range already covers either end
  ranges << range if ranges.none? { |r| r.include?(lo) || r.include?(hi) }

  ranges
end
|
88
145
|
end
|
146
|
+
|
147
|
+
# ####For generating conjugations.rb content
|
148
|
+
# @conjugations = get_conjugations
|
149
|
+
# @conjugations.each do |k,v|
|
150
|
+
# @con = v['con']
|
151
|
+
# @con_id[@con] = k #id
|
152
|
+
# #print "'#{@con}' => #{k},\n"
|
153
|
+
# #print "'#{@con}' => #{v['verb_id']},\n"
|
154
|
+
# #print "'#{@con}' => #{v['tiempo_id']},\n"
|
155
|
+
# #@tiempos[@con] = v['tiempo_id'] #tiempo_id
|
156
|
+
# #@verbs[@con] = v['verb_id'] #verb_id
|
157
|
+
# end
|
158
|
+
|
data/lib/conjugations.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: clause_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-08
|
12
|
+
date: 2012-09-08 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: A simple hello world gem
|
15
15
|
email: mikefabrikant@gmail.com
|