lexical_units 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +8 -0
- data/lib/lexical_units.rb +2 -0
- data/lib/lexical_units/sentences.rb +19 -0
- data/lib/lexical_units/string.rb +11 -0
- data/lib/lexical_units/version.rb +1 -1
- data/spec/lexical_units/sentences_spec.rb +55 -0
- data/spec/lexical_units/string_spec.rb +25 -0
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 35f8950c1907e3f92afb2a3dc64ee3506779d87c
|
4
|
+
data.tar.gz: 3b08b4af5ada8ee6abaad85621b7f0cfd5850df7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d610db7578359ee8724beea9f019b4b62e514ece052fa4d6d5b417af1069c58818387329c18b97145761522fc52ab613f20ea8e5d079b097baadad7102a83fe
|
7
|
+
data.tar.gz: eb0fd0fca6c27dbcb4169a97f4e18969f06f204b613866cee58a0365102603c131643b711bf9c42d86d06d29e50aceed173b0fee785081c073ed0b1db8dabaf9
|
data/CHANGELOG.md
ADDED
data/README.md
CHANGED
@@ -20,6 +20,14 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
```ruby
|
22
22
|
LexicalUnits::words(text)
|
23
|
+
LexicalUnits::sentences(text)
|
24
|
+
```
|
25
|
+
|
26
|
+
You can include methods into String class:
|
27
|
+
```ruby
|
28
|
+
class String
|
29
|
+
include LexicalUnits::String
|
30
|
+
end
|
23
31
|
```
|
24
32
|
|
25
33
|
## Contributing
|
data/lib/lexical_units.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
module LexicalUnits
|
2
|
+
# Split text into sentences
|
3
|
+
#
|
4
|
+
# self.sentences("Lorem, ipsum. Dolor?") #=> ["Lorem, ipsum.", "Dolor?"]
|
5
|
+
# self.sentences("Lorem! Ipsum dolor?") #=> ["Lorem!", "Ipsum dolor?"]
|
6
|
+
def self.sentences(text)
|
7
|
+
separators = LexicalUnits::sentence_separators
|
8
|
+
regexp = Regexp.new("[^#{separators}]+[#{separators}]{1,3}")
|
9
|
+
text.scan(regexp).map(&:strip)
|
10
|
+
end
|
11
|
+
|
12
|
+
private
|
13
|
+
def self.sentence_separators
|
14
|
+
[
|
15
|
+
'\.', '\?', '\!',
|
16
|
+
'‽'
|
17
|
+
].join
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe LexicalUnits do
|
5
|
+
context ".sentences" do
|
6
|
+
let(:klass) { LexicalUnits }
|
7
|
+
|
8
|
+
it "splits text into sentences" do
|
9
|
+
text = %q{Lorem ipsum dolor sit amet, consectetur adipiscing elit.
|
10
|
+
Fusce ut lacinia lorem. Nullam a sem quam. Duis faucibus tortor in.}
|
11
|
+
array = [
|
12
|
+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
|
13
|
+
"Fusce ut lacinia lorem.",
|
14
|
+
"Nullam a sem quam.",
|
15
|
+
"Duis faucibus tortor in."
|
16
|
+
]
|
17
|
+
|
18
|
+
klass::sentences(text).should eq(array)
|
19
|
+
end
|
20
|
+
|
21
|
+
it "splits text with question mark and exclamation mark into sentences" do
|
22
|
+
text = "Lorem ipsum dolor! Sit amet? Consectetur adipiscing elit."
|
23
|
+
array = [
|
24
|
+
"Lorem ipsum dolor!",
|
25
|
+
"Sit amet?",
|
26
|
+
"Consectetur adipiscing elit."
|
27
|
+
]
|
28
|
+
|
29
|
+
klass::sentences(text).should eq(array)
|
30
|
+
end
|
31
|
+
|
32
|
+
it "splits text with ellipsis into sentences" do
|
33
|
+
text = "Lorem ipsum dolor, sit amet... Consectetur adipiscing elit."
|
34
|
+
array = [
|
35
|
+
"Lorem ipsum dolor, sit amet...",
|
36
|
+
"Consectetur adipiscing elit."
|
37
|
+
]
|
38
|
+
|
39
|
+
klass::sentences(text).should eq(array)
|
40
|
+
end
|
41
|
+
|
42
|
+
it "splits text with interrobangs into sentences" do
|
43
|
+
text = "Say what‽ She's pregnant‽ Who is the father‽‽‽ Really?"
|
44
|
+
array = [
|
45
|
+
"Say what‽",
|
46
|
+
"She's pregnant‽",
|
47
|
+
"Who is the father‽‽‽",
|
48
|
+
"Really?"
|
49
|
+
]
|
50
|
+
|
51
|
+
klass::sentences(text).should eq(array)
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe LexicalUnits::String do
|
4
|
+
class String
|
5
|
+
include LexicalUnits::String
|
6
|
+
end
|
7
|
+
|
8
|
+
context "#words" do
|
9
|
+
it "splits String into words" do
|
10
|
+
array = %w(Lorem ipsum dolor sit amet)
|
11
|
+
string = array.join(' ')
|
12
|
+
|
13
|
+
string.words.should eq(array)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
context "#sentences" do
|
18
|
+
it "splits String into sentences" do
|
19
|
+
array = ["Lorem ipsum!", "Dolor sit?", "Amet."]
|
20
|
+
string = array.join
|
21
|
+
|
22
|
+
string.sentences.should eq(array)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lexical_units
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksander Malaszkiewicz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -50,6 +50,7 @@ files:
|
|
50
50
|
- .ruby-gemset
|
51
51
|
- .ruby-version
|
52
52
|
- .travis.yml
|
53
|
+
- CHANGELOG.md
|
53
54
|
- Gemfile
|
54
55
|
- Guardfile
|
55
56
|
- LICENSE.txt
|
@@ -57,8 +58,12 @@ files:
|
|
57
58
|
- Rakefile
|
58
59
|
- lexical_units.gemspec
|
59
60
|
- lib/lexical_units.rb
|
61
|
+
- lib/lexical_units/sentences.rb
|
62
|
+
- lib/lexical_units/string.rb
|
60
63
|
- lib/lexical_units/version.rb
|
61
64
|
- lib/lexical_units/words.rb
|
65
|
+
- spec/lexical_units/sentences_spec.rb
|
66
|
+
- spec/lexical_units/string_spec.rb
|
62
67
|
- spec/lexical_units/words_spec.rb
|
63
68
|
- spec/spec_helper.rb
|
64
69
|
homepage: ''
|
@@ -86,5 +91,7 @@ signing_key:
|
|
86
91
|
specification_version: 4
|
87
92
|
summary: Split text into lexical units
|
88
93
|
test_files:
|
94
|
+
- spec/lexical_units/sentences_spec.rb
|
95
|
+
- spec/lexical_units/string_spec.rb
|
89
96
|
- spec/lexical_units/words_spec.rb
|
90
97
|
- spec/spec_helper.rb
|