clause_extractor 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2)
  1. data/lib/clause_extractor.rb +25 -100
  2. metadata +2 -2
@@ -1,89 +1,8 @@
1
1
  class ClauseExtractor
2
2
  require "conjugations"
3
-
4
- pronouns = "(i|you|he|she|it|they|we|there)"
5
- present_perfect = "(already|ever|for|just|never|since|yet)"
6
- have_has = "(have|has|haven't|hasn't)"
7
- was_were = "(were|was|wasn't|weren't)"
8
- had = "([a-z]{1,4}'d|had)(n't)*"
9
- have_has = "(have|has|haven't|hasn't|havent|hasnt|has not|have not)"
10
- contractions = "(it'*s|he'*s|she'*s|[a-z]{1,4}'*ve)"
11
- to_be = "(am|are|'m|'re|'s|is|[a-z]{1,4}'re)"
12
- will = "(will|[a-z]{1,4}'ll)"
13
- would = "(would|[a-z]{1,4}'d)"
14
-
15
- @tense_regexes = {
16
-
17
- 'third' => {
18
- "simple present" => [
19
- /\b(he|she|it)\s+search(s)?\b/i, #he arrives
20
- /\bsearch(s)?\s+(it|them|him|her|me|you|us)\b/i #adapts it
21
- ]
22
- },
23
- 'infinitive' => {
24
- "simple present" => [/\b((I|you|they|we|to)\s+)*+search\b/i],#to arrive
25
-
26
-
27
- "subjunctive future" => [
28
- /\bif\s+#{pronouns}\s+#{was_were}\s+(not\s+)*to\s+(not\s+)*search/i, #if I were to arise
29
- /\bif\s+#{pronouns}\s+should(n't)*\s+(not\s+)*search/i #If I should arise
30
- ],
31
- "subjunctive present" => [ /\bthat\s+#{pronouns}\s+(not\s+)*search/i], #that we arrive
32
-
33
- "conditional simple" => [ /\b(#{pronouns}\s+)*(would(n't)*|[a-z]{1,4}'d)(\s+not)*\s+search/i], #I would arise, I wouldn't arise
34
-
35
- "will-future" => [ /\b(#{pronouns}\s+)*(will|[a-z]{1,4}'ll)(\s+not)*\s+search/i], #I'll arise
36
-
37
- "going to-future" => [ /\b(#{pronouns}\s+)*#{to_be}\s+(not\s+)*going\s+to\s+search/i], #they are going to cry
38
- },
39
- 'gerund' => {
40
- "conditional perfect progressive" => [ /\b(#{pronouns}\s+)*would\s+(not\s+)*have\s+(not\s+)*been\s+search/i], #I would have been searching
41
- "present perfect progressive" => [
42
- /\b(#{have_has}\s+)(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*been\s+search/i, #have they not been searching
43
- /\b(#{pronouns}\s+)*#{have_has}*\s+(not\s+)*(#{present_perfect}\s+)*been\s+search/i #I have been searching
44
- ],
45
- "past perfect progressive" => [
46
- /\b(#{pronouns}\s+)*#{had}\s(not\s+)*(#{present_perfect}\s+)*been\s+search/i, #I had been searching,
47
- /\bhad(n't)*\s+(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*been\s+search/i, #had he not been searching
48
-
49
- ],
50
-
51
- "conditional progressive" => [/\b(#{pronouns}\s+)*#{would}\s+(not\s+)*be\s+search/i], #I would be searching (I'd)
52
- "future progressive" => [
53
- /\b((#{pronouns})\s+)*#{will}\s+(not\s+)*be\s+search/i,
54
- /\bwill\s+(#{pronouns}\s+)(not\s+)*be\s+search/i,
55
- ], #I will be searching
56
- "past progressive" => [/\b(#{pronouns}\s+)*#{was_were}*\s+(not\s+)*search/i], #I was searching
57
-
58
- "present progressive" => [/\b(#{pronouns}\s*)*(#{to_be}\s+)*(not\s+)*search/i], #I'm rising
59
- },
60
- "past-participle" => {
61
- "conditional perfect" => [/\b(#{pronouns}\s+)*#{would}\s+(not\s+)*have\s+(not\s+)*search/i], #I would not have arisen
62
- "future perfect" => [/\b(#{pronouns}\s+)*#{will}\s+have\s+search/i], #I'll have arisen
63
- "past perfect" => [
64
- /\b(#{pronouns}\s+)*#{had}\s+(not\s+)*((#{present_perfect})\s+)*search/i, #I had arisen
65
- /\b#{had}\s+(#{pronouns}\s+)*(not\s+)*((#{present_perfect})\s+)*search/i
66
- ],
67
- "present perfect" => [
68
- /\b(#{pronouns}\s+)*#{have_has}\s+((#{present_perfect})\s+)*search/, #They have already seen
69
- /\b#{have_has}\s+(#{pronouns}\s+)*(not\s+)*(#{present_perfect}\s+)*search/ #Have they already seen
70
- ],
71
- "subjunctive past" => [/\bif\s+#{pronouns}\s+search/i], #if I arose
72
- "simple past" => [/\b#{pronouns}\s+search/i] #you chose
73
- },
74
- #"present perfect" => [/^\s*search\b/i], #arisen
75
- #"simple past" => [/^\s*search\b/i] #arose
76
- }
77
- def self.get_match_start_index(verb, match, index)
78
- #get start position of last occurrence of verb in match
79
- verb_index_in_match = match.index /#{verb}(?!.*#{verb})/i
80
- #count spaces between match start and verb_index_in_match and subtract that from index
81
- lo = index - match[0,verb_index_in_match].split(/\s+/).size
82
- hi = lo + match[0,verb_index_in_match].split(/\s+/).size
83
- return lo, hi
84
- end
3
+ require "matchers"
85
4
 
86
- def self.get_clauses(phrase, format = String.new, verbs=nil, tiempo=nil, id_tiempo=nil, tense_id=nil, con_id=nil)
5
+ def self.get_clauses(phrase, format = String.new)
87
6
  @format = format
88
7
  phrase = phrase.downcase
89
8
  #list = format.match("audioverb") ? Hash.new : Array.new
@@ -94,32 +13,38 @@ class ClauseExtractor
94
13
  @tense_id ||= get_tenses
95
14
  @con_id ||= get_con_id
96
15
  ranges = []
97
- a=Array.new
98
- a = phrase.split(/\s+/)
99
- a.length.times do |i|
100
- a[i].gsub!(/[!.?\(\)]/,"") if a[i] #remove any punctuation from the word
101
- if @con_id[a[i]] then #if word matches a conjugation
102
- @tense_regexes.each do |k,v|
103
- if k.match(/#{@id_tiempo[@tiempos[a[i]]]}/)
16
+
17
+ phrase.gsub!(/[!.?\(\)]/,"") if phrase
18
+ phrase_a = phrase.split(/\s+/)
19
+ phrase_a.length.times do |i|
20
+ # phrase_a[i].gsub!(/[!.?\(\)]/,"") if phrase_a[i] #remove any punctuation from the word
21
+ if @con_id[phrase_a[i]] then #if word matches a conjugation
22
+ $tense_regexes.each do |k,v|
23
+ if k.match(/#{@id_tiempo[@tiempos[phrase_a[i]]]}/)
104
24
  v.each do |tense, regex_array|
105
25
  regex_array.each do |regex|
106
- regex = regex.to_s.gsub("search", "#{a[i]}")
107
- phrase, list, ranges = scan_phrase(phrase, list, regex, a[i], tense, i, ranges)
26
+ regex = regex.to_s.gsub("search", "#{phrase_a[i]}")
27
+ phrase, list, ranges = scan_phrase(phrase, list, regex, phrase_a[i], tense, i, ranges)
108
28
  end
109
29
  end
110
30
  end
111
31
  end
112
32
  end
113
- end
114
- list.each do |k, v|
115
- list.delete(k) unless ranges.include?(v)
116
- end
117
- list.each do |k,v|
118
- print "#{k}\n"
119
- end
33
+ end
34
+ list.each { |k, v| list.delete(k) unless ranges.include?(v) }
35
+ list.each { |k, v| print "#{k}\n" }
120
36
  list
121
37
  end
122
38
 
39
+ def self.get_match_start_index(verb, match, index)
40
+ #get start position of last occurrence of verb in match
41
+ verb_index_in_match = match.index /#{verb}(?!.*#{verb})/i
42
+ #count spaces between match start and verb_index_in_match and subtract that from index
43
+ lo = index - match[0,verb_index_in_match].split(/\s+/).size
44
+ hi = lo + match[0,verb_index_in_match].split(/\s+/).size
45
+ return lo, hi
46
+ end
47
+
123
48
  def self.scan_phrase(phrase, list, regex, verb, tense_label, index, ranges)
124
49
  if match = phrase.match(/#{regex}/i)
125
50
  match = match.to_s
@@ -128,7 +53,7 @@ class ClauseExtractor
128
53
  if @format.match(/audioverb/)
129
54
  list[@tense_id["#{tense_label}"].to_s+":" + match.to_s + ":" + @verbs[verb].to_s] = (lo..hi)
130
55
  else
131
- list["#{tense_label}:" + match.to_s + ":" + (lo..hi).to_s] = (lo..hi) unless @format.match(/audioverb/)
56
+ list["#{tense_label}:" + match.to_s + ":" + (lo..hi).to_s] = (lo..hi)
132
57
  end
133
58
  end
134
59
  return phrase, list, ranges
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: clause_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-08 00:00:00.000000000 Z
12
+ date: 2012-10-06 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: English verbal clause extractor
15
15
  email: mikefabrikant@gmail.com