clause_extractor 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,88 +1,158 @@
1
- module ClauseExtractor
2
- class Clause
3
- require "./conjugations2"
4
-
5
- @tense_regexes = {
6
- "present perfect" => [/\b(have|has|it's|he's|she's|[a-z]{1,4}'ve)\s+((i|you|he|she|it|they|we)\s+)*(not\s+)*((just|already)\s+)*search/i], #I have arisen/Have I not arisen
7
- "future progressive" => [/\b(will|[a-z]{1,4}'ll)\s+(not\s+)*be\s+search/i, /\b[a-z]{1,4}'ll\s+(not\s+)*be\s+search/i], #I will be searching
8
- "present perfect progressive" => [/\b([a-z]{1,4}'ve|have|has)(n't)*\s+(not\s+)*((just|already)\s+)*been\s+search/i], #I have been searching
9
- "subjunctive future" => [/\bif\s+(i|you|he|she|it|they|we)\s+were\s+(not\s+)*to\s+(not\s+)*search/i], #if I were to arise
10
- "going to-future" => [/\b(am|are|i'm|[a-z]{1,4}'re|[a-z]{1,4}'s)\s+(not\s+)*going\s+to\s+search/i], #they are going to cry
11
- "present progressive" => [/\b(am|are|is|i'm|\b[a-z]{1,4}'re|\b[a-z]{1,4}'s)\s+(not\s+)*search/i], #I'm rising
12
- "subjunctive present" => [/if\s+(i|you|he|she|it|they|we)\s+should\s+(not\s+)*search/i], #if I should arise
13
- "conditional perfect" => [/\b(would|[a-z]{1,4}'d)\s+(not\s+)*have\s+(not\s+)*search/i], #I would not search
14
- "past perfect" => [/\b(had|[a-z]{1,4}'d)\s+(not\s+)*(just\s+)*search/i], #I had arisen
15
- "subjunctive present" => [/\bthat\s+(i|you|he|she|they|we)\s+(not\s+)*search/i], #that we arrive
16
- "conditional perfect progressive" => [/would\s+(not\s+)*have\s+(not\s+)*been\s+search/i], #I would have been searching
17
- "conditional progressive" => [/\b(would|[a-z]{1,4}'d)\s+(not\s+)*be\s+search/i], #I would be searching (I'd)
18
- "subjunctive past" => [/\bif\s+(i|you|he|she|it|they|we)\s+search/i], #if I arose
19
- "conditional simple" => [/\b(would|[a-z]{1,4}'d)(\s+not)*\s+search/i], #I would arise
20
- "will-future" => [/\b(will|[a-z]{1,4}'ll)(\s+not)*\s+search/i], #I'll arise
21
- "past progressive" => [/\b(was|were)(n't)*\s+(not\s+)*search/i], #I was searching
22
- "future perfect" => [/\b(will|[a-z]{1,4}'ll)\s+have\s+search/i], #I'll have arisen
23
- "present perfect progressive" => [/\bhave\s+(not\s+)*been\s+search/i], #I have been searching
24
- "simple past" => [/\b(i|you|he|she|it|they)\s+search/i], #you chose
25
- "simple present" => [
26
- /\b(I|you|they|we|to)\s+search\b/i, #arrive
27
- /\b(he|she|it)\s+search(s)?\b/i, #he arrives
28
- /\bsearch(s)?\s+(it|them|him|her|me|you|us)\b/i #adapts it
29
- ],
30
- "present progressive" => [/^search\b/i], #searching
31
- "present perfect" => [/^search\b/i], #arisen
32
- "simple past" => [/^search\b/i] #arose
33
- }
34
-
35
- def self.scan_phrase(phrase, list, regex, a_i, tense_label, index, ranges)
36
- if match = phrase.match(/#{regex}/i)
37
- if ranges.each.select{|r| r.include?(index) || r.include?(index+match.to_s.split(/\s/).length)}.size == 0
38
- ranges << (index .. (index + (match.to_s.split(/\s/).length-1)))
39
- print "#{ranges} RRR #{match} #{tense_label} ... #{index.class} \n"
40
- list[@tense_id["#{tense_label}"].to_s+":" + match.to_s + ":" + @verbs[a_i].to_s]=1 if @format.match(/audioverb/)
41
- list << "#{tense_label}:#{match.to_s}" unless @format.match(/audioverb/)
42
- end
43
- end
44
- return phrase, list, ranges
45
- end
1
+ class ClauseExtractor
2
+ require "conjugations"
3
+
4
+ pronouns = "(i|you|he|she|it|they|we|there)"
5
+ present_perfect = "(already|ever|for|just|never|since|yet)"
6
+ have_has = "(have|has|haven't|hasn't)"
7
+ contractions = "it's|he's|she's|[a-z]{1,4}'ve"
8
+
9
+
10
+
11
+ @tense_regexes = {
12
+
13
+ 'third' => {
14
+ "simple present"
15
+ => [
16
+ /\b(he|she|it)\s+search(s)?\b/i, #he arrives
17
+ /\bsearch(s)?\s+(it|them|him|her|me|you|us)\b/i #adapts it
18
+ ]
19
+ },
20
+ 'infinitive' => {
21
+ "simple present" => [
22
+ /\b((I|you|they|we|to)\s+)*+search\b/i, #arrive
23
+ ],
24
+
25
+ "subjunctive future" => [
26
+ /\bif\s+#{pronouns}\s+were\s+(not\s+)*to\s+(not\s+)*search/i, #if I were to arise
27
+ /\bif\s+#{pronouns}\s+should\s+(not\s+)*search/i #If I should arise
28
+ ],
29
+ "subjunctive present" => [ /\bthat\s+#{pronouns}\s+(not\s+)*search/i], #that we arrive
30
+
31
+ "conditional simple" => [ /\b(#{pronouns}\s+)*(would|[a-z]{1,4}'d)(\s+not)*\s+search/i], #I would arise
32
+
33
+ "will-future" => [ /\b(#{pronouns}\s+)*(will|[a-z]{1,4}'ll)(\s+not)*\s+search/i], #I'll arise
34
+
35
+ "going to-future" => [ /\b(#{pronouns}\s+)*(am|are|i'm|[a-z]{1,4}'re|[a-z]{1,4}'s)\s+(not\s+)*going\s+to\s+search/i], #they are going to cry
36
+ },
37
+ 'gerund' => {
38
+ "conditional perfect progressive" => [/\b(#{pronouns}\s+)*would\s+(not\s+)*have\s+(not\s+)*been\s+search/i], #I would have been searching
39
+ "present perfect progressive" => [/\b(#{pronouns}\s+)*([a-z]{1,4}'ve|have|has)(n't)*\s+(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*been\s+search/i], #I have been searching
40
+ "conditional progressive" => [/\b(#{pronouns}\s+)*(would|[a-z]{1,4}'d)\s+(not\s+)*be\s+search/i], #I would be searching (I'd)
41
+ "future progressive" => [
42
+ /\b((#{pronouns})\s+)*(will|[a-z]{1,4}'ll)\s+(not\s+)*be\s+search/i,
43
+ /\bwill\s+(#{pronouns}\s+)(not\s+)*be\s+search/i,
44
+ ], #I will be searching
45
+ "past progressive" => [/\b(#{pronouns}\s+)*(was|were)(n't)*\s+(not\s+)*search/i], #I was searching
46
+
47
+ "present progressive" => [/\b(#{pronouns}\s+)*((am|are|is|i'm|\b[a-z]{1,4}'re|\b[a-z]{1,4}'s)\s+)*(not\s+)*search/i], #I'm rising
48
+
49
+ },
50
+ "past-participle" => {
51
+ "conditional perfect" => [/\b(#{pronouns}\s+)*(would|[a-z]{1,4}'d)\s+(not\s+)*have\s+(not\s+)*search/i], #I would not search
52
+ "future perfect" => [/\b(#{pronouns}\s+)*(will|[a-z]{1,4}'ll)\s+have\s+search/i], #I'll have arisen
53
+ "past perfect" => [/\b(#{pronouns}\s+)*(had|[a-z]{1,4}'d)\s+(#{pronouns}\s+)*(not\s+)*((#{present_perfect})\s+)*search/i], #I had arisen
54
+ "present perfect" => [/\b(#{pronouns}\s+)*#{have_has}\s+(#{pronouns}\s+)*(not\s+)*((just|already|ever)\s+)*search/], #Have you seen
55
+ "subjunctive past" => [/\bif\s+(i|you|he|she|it|they|we)\s+search/i], #if I arose
56
+ "simple past" => [/\b#{pronouns}\s+search/i] #you chose
57
+ },
58
+
59
+
46
60
 
47
- def self.get_clauses(phrase, format = String.new, verbs=nil, tiempo=nil, id_tiempo=nil, tense_id=nil, con_id=nil)
48
- @format = format
49
- phrase = phrase.downcase
50
- list = format.match("audioverb") ? Hash.new : Array.new
51
- @verbs ||= get_verbs
52
- @tiempos ||= get_tiempos
53
- @id_tiempo ||= get_id_tiempos
54
- @tense_id ||= get_tenses
55
- @con_id ||= get_con_id
56
- ranges = []
57
-
58
-
59
- # ####For generating conjugations.rb content
60
- # @conjugations = get_conjugations
61
- # @conjugations.each do |k,v|
62
- # @con = v['con']
63
- # @con_id[@con] = k #id
64
- # #print "'#{@con}' => #{k},\n"
65
- # #print "'#{@con}' => #{v['verb_id']},\n"
66
- # #print "'#{@con}' => #{v['tiempo_id']},\n"
67
- # #@tiempos[@con] = v['tiempo_id'] #tiempo_id
68
- # #@verbs[@con] = v['verb_id'] #verb_id
69
- # end
70
-
71
- a=Array.new
72
- a = phrase.split(/\s+/)
73
- a.length.times do |i|
74
- a[i].gsub!(/[!.?\(\)]/,"") if a[i] #remove any punctuation from the word
75
- if @con_id[a[i]] then #if word matches a conjugation
76
- @tense_regexes.each do |k,v|
77
- v.each do |regex|
78
- regex = regex.to_s.gsub("search", "#{a[i]}")
79
-
80
- phrase, list, ranges = scan_phrase(phrase, list, regex, a[i], k, i, ranges)
61
+
62
+
63
+
64
+
65
+ # # "present perfect" => [/^\s*search\b/i], #arisen
66
+ # # "simple past" => [/^\s*search\b/i] #arose
67
+ }
68
+
69
+
70
+
71
# Computes the word span (in phrase word positions) covered by a tense match.
#
# The span ends at +index+ and begins as many words earlier as there are
# whitespace-separated words in +match+ before the last occurrence of +verb+.
#
# @param verb [String] the conjugated verb that was substituted into the tense regex
# @param match [String] the text the tense regex matched
# @param index [Integer] word position of +verb+ within the phrase
# @return [Array<Integer>] +[lo, hi]+, where +hi+ is always +index+
def self.get_match_start_index(verb, match, index)
  # Escape the verb so regex metacharacters in it are matched literally;
  # rindex locates the last occurrence even when the match spans newlines.
  verb_offset = match.rindex(/#{Regexp.escape(verb)}/i)

  # If the verb cannot be located, fall back to a span covering only the
  # verb's own word rather than raising TypeError from match[0, nil].
  return [index, index] if verb_offset.nil?

  # Number of words in the match that precede the verb.
  words_before_verb = match[0, verb_offset].split(/\s+/).size
  [index - words_before_verb, index]
end
79
+
80
+ def self.get_clauses(phrase, format = String.new, verbs=nil, tiempo=nil, id_tiempo=nil, tense_id=nil, con_id=nil)
81
+ @format = format
82
+ phrase = phrase.downcase
83
+ #list = format.match("audioverb") ? Hash.new : Array.new
84
+ list = Hash.new
85
+ @verbs ||= get_verbs
86
+ @tiempos ||= get_tiempos
87
+ @id_tiempo ||= get_id_tiempos
88
+ @tense_id ||= get_tenses
89
+ @con_id ||= get_con_id
90
+ ranges = []
91
+ a=Array.new
92
+ a = phrase.split(/\s+/)
93
+ a.length.times do |i|
94
+ a[i].gsub!(/[!.?\(\)]/,"") if a[i] #remove any punctuation from the word
95
+ if @con_id[a[i]] then #if word matches a conjugation
96
+ @tense_regexes.each do |k,v|
97
+ if k.match(/#{@id_tiempo[@tiempos[a[i]]]}/)
98
+ v.each do |tense, regex_array|
99
+ regex_array.each do |regex|
100
+ regex = regex.to_s.gsub("search", "#{a[i]}")
101
+ phrase, list, ranges = scan_phrase(phrase, list, regex, a[i], tense, i, ranges)
102
+
103
+ end
81
104
  end
82
105
  end
83
- end #end if is conjugation
84
- end#end of looping through each cap
85
- @list
106
+ end
107
+ end #end if is conjugation
108
+ end#end of looping through each cap
109
+ list.each do |k, v|
110
+ list.delete(k) unless ranges.include?(v)
86
111
  end
112
+ print "#{list}\n"
113
+ list
87
114
  end
115
+
116
# Applies one tense regex to the phrase and, on a hit, records the clause.
#
# On a match, the word span of the clause is computed, merged into +ranges+,
# and stored in +list+ under a key built from the tense, the matched text and
# the verb id. The key uses the tense id when @format contains "audioverb",
# and the tense label otherwise.
#
# @param phrase [String] the phrase being scanned
# @param list [Hash] clause key => word range, updated in place
# @param regex [String] tense pattern with the verb already substituted
# @param verb [String] the conjugated verb being examined
# @param tense_label [String] name of the tense the pattern belongs to
# @param index [Integer] word position of +verb+ in the phrase
# @param ranges [Array<Range>] word ranges accepted so far
# @return [Array] +[phrase, list, ranges]+
def self.scan_phrase(phrase, list, regex, verb, tense_label, index, ranges)
  found = phrase.match(/#{regex}/i)
  return [phrase, list, ranges] unless found

  clause = found.to_s
  lo, hi = get_match_start_index(verb, clause, index)
  ranges = prioritize_ranges(ranges, lo, hi, clause)
  span = (lo..hi)

  if @format.match(/audioverb/)
    list[@tense_id["#{tense_label}"].to_s + ":" + clause + ":" + @verbs[verb].to_s] = span
  else
    list["#{tense_label} :" + clause + ":" + @verbs[verb].to_s] = span
  end

  [phrase, list, ranges]
end
126
+
127
# Merges a new word range into the list of accepted ranges, preferring longer ones.
#
# For every existing range:
# * if it starts at +lo+ and is shorter than the new range, it is replaced by
#   the new range;
# * otherwise, if the new range covers its begin or end and is longer, it is
#   dropped;
# * otherwise it is kept.
# Finally the new range is appended unless some remaining range already
# contains +lo+ or +hi+.
#
# The list is rebuilt in a single pass rather than deleted from while being
# walked by index, which skipped the element after each deletion and ran off
# the end of the array (NoMethodError on nil).
#
# @param ranges [Array<Range>] accepted ranges; updated in place
# @param lo [Integer] first word position of the new range
# @param hi [Integer] last word position of the new range
# @param match [String] matched text (unused; kept for interface compatibility)
# @return [Array<Range>] the same +ranges+ array after merging
def self.prioritize_ranges(ranges, lo, hi, match)
  candidate = (lo..hi)

  kept = []
  ranges.each do |existing|
    if existing.begin == lo && existing.count < candidate.count
      # Same start, new range is longer: it takes the old range's place
      # (guarded so the candidate is never inserted twice).
      kept << candidate unless kept.include?(candidate)
    elsif (candidate.include?(existing.begin) || candidate.include?(existing.end)) &&
          candidate.count > existing.count
      # Overlapped by a longer candidate: drop the existing range.
      next
    else
      kept << existing
    end
  end

  # Mutate the caller's array so callers holding a reference see the result.
  ranges.replace(kept)

  # Add the candidate only if neither endpoint already lies in a kept range.
  ranges << candidate if ranges.none? { |r| r.include?(lo) || r.include?(hi) }

  ranges
end
88
145
  end
146
+
147
+ # ####For generating conjugations.rb content
148
+ # @conjugations = get_conjugations
149
+ # @conjugations.each do |k,v|
150
+ # @con = v['con']
151
+ # @con_id[@con] = k #id
152
+ # #print "'#{@con}' => #{k},\n"
153
+ # #print "'#{@con}' => #{v['verb_id']},\n"
154
+ # #print "'#{@con}' => #{v['tiempo_id']},\n"
155
+ # #@tiempos[@con] = v['tiempo_id'] #tiempo_id
156
+ # #@verbs[@con] = v['verb_id'] #verb_id
157
+ # end
158
+
data/lib/conjugations.rb CHANGED
@@ -2,10 +2,11 @@
2
2
 
3
3
  def get_id_tiempos
4
4
  id_tiempo = {
5
- 5 => "infinitive",
6
- 6 => "gerund",
7
- 7 => "participle",
8
- 8 => "past"
5
+ 6 => "infinitive",
6
+ 7 => "gerund",
7
+ 8 => "participle",
8
+ 9 => "past",
9
+ 10 => "third"
9
10
  }
10
11
  end
11
12
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: clause_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-25 00:00:00.000000000 Z
12
+ date: 2012-09-08 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: A simple hello world gem
15
15
  email: mikefabrikant@gmail.com