lexical_units 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -1
- data/lexical_units.gemspec +1 -1
- data/lib/lexical_units/sentences.rb +5 -2
- data/lib/lexical_units/string.rb +7 -2
- data/lib/lexical_units/syllables.rb +2 -0
- data/lib/lexical_units/version.rb +2 -1
- data/lib/lexical_units/words.rb +12 -11
- data/lib/lexical_units/words_without_digits.rb +7 -2
- data/lib/lexical_units.rb +6 -9
- data/spec/lexical_units/sentences_spec.rb +21 -20
- data/spec/lexical_units/string_spec.rb +15 -5
- data/spec/lexical_units/syllables_spec.rb +1 -2
- data/spec/lexical_units/words_spec.rb +54 -30
- data/spec/lexical_units/words_without_digits_spec.rb +4 -4
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fedd5fd50283e6f7f2fb2881a8c46326cdeffb7b
|
4
|
+
data.tar.gz: ec83c68109e1568fdd8b7b1b2e4628ea28038b2c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b92570fd96372f27851e2dd3c9d771d534ae99f5a49847ba9289dc61f6c716a446ff3148d33bb409c0a38a6953856ee9f71683648d12c4d1da56fd5960de44a1
|
7
|
+
data.tar.gz: f749de922a2cd228733abc2de4690ffd9853e49bbba5533fe46625f41230147e0c78d5db11155b2b40be33287d4a854d597bc34e10747c5105554d337271cacc
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
# LexicalUnits [](http://badge.fury.io/rb/lexical_units) [](https://travis-ci.org/fractalsoft/lexical_units) [](https://gemnasium.com/fractalsoft/lexical_units) [](https://coveralls.io/r/fractalsoft/lexical_units)
|
1
|
+
# LexicalUnits [](http://badge.fury.io/rb/lexical_units) [](https://travis-ci.org/fractalsoft/lexical_units) [](https://gemnasium.com/fractalsoft/lexical_units) [](https://coveralls.io/r/fractalsoft/lexical_units) [](http://waffle.io/fractalsoft/lexical_units)
|
2
|
+
|
2
3
|
[](https://coderwall.com/torrocus)
|
3
4
|
|
4
5
|
Lexical unit is a single word, a part of a word, or a chain of words that forms the basic elements of a language's lexicon.
|
@@ -22,6 +23,7 @@ Or install it yourself as:
|
|
22
23
|
```ruby
|
23
24
|
LexicalUnits::words(text)
|
24
25
|
LexicalUnits::sentences(text)
|
26
|
+
LexicalUnits::words_without_digits(text)
|
25
27
|
```
|
26
28
|
|
27
29
|
You can include methods into String class:
|
data/lexical_units.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Aleksander Malaszkiewicz"]
|
10
10
|
spec.email = ["info@fractalsoft.org"]
|
11
11
|
spec.summary = %q{Split text into lexical units}
|
12
|
-
spec.homepage = ""
|
12
|
+
spec.homepage = "https://github.com/fractalsoft/lexical_units"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
15
15
|
spec.files = `git ls-files`.split($/)
|
@@ -1,20 +1,23 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
3
|
+
# Part of split into sentences
|
2
4
|
module LexicalUnits
|
3
5
|
# Split text into sentences
|
4
6
|
#
|
5
7
|
# self.words("Lorem, ipsum. Dolor?") #=> ["Lorem, ipsum.", "Dolor?"]
|
6
8
|
# self.words("Lorem! Ipsum dolor?") #=> ["Lorem!", "Ipsum dolor?"]
|
7
9
|
def self.sentences(text)
|
8
|
-
separators = LexicalUnits
|
10
|
+
separators = LexicalUnits.sentence_separators
|
9
11
|
regexp = Regexp.new("[^#{separators}]+[#{separators}]{1,3}")
|
10
12
|
text.scan(regexp).map(&:strip)
|
11
13
|
end
|
12
14
|
|
13
15
|
private
|
16
|
+
|
14
17
|
def self.sentence_separators
|
15
18
|
[
|
16
19
|
'\.', '\?', '\!',
|
17
|
-
|
20
|
+
'‽'
|
18
21
|
].join
|
19
22
|
end
|
20
23
|
end
|
data/lib/lexical_units/string.rb
CHANGED
@@ -1,12 +1,17 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module LexicalUnits
|
3
|
+
# Use lexical units inside String class
|
3
4
|
module String
|
4
5
|
def words
|
5
|
-
LexicalUnits
|
6
|
+
LexicalUnits.words(self)
|
6
7
|
end
|
7
8
|
|
8
9
|
def sentences
|
9
|
-
LexicalUnits
|
10
|
+
LexicalUnits.sentences(self)
|
11
|
+
end
|
12
|
+
|
13
|
+
def words_without_digits
|
14
|
+
LexicalUnits.words_without_digits(self)
|
10
15
|
end
|
11
16
|
end
|
12
17
|
end
|
data/lib/lexical_units/words.rb
CHANGED
@@ -1,27 +1,28 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
3
|
+
# Part of split into words
|
2
4
|
module LexicalUnits
|
3
5
|
# Split text into words
|
4
6
|
#
|
5
|
-
# self.words("Lorem ipsum dolor sit") #=> ["Lorem",
|
7
|
+
# self.words("Lorem ipsum dolor sit") #=> ["Lorem","ipsum", "dolor", "sit"]
|
6
8
|
# self.words("Lorem, ipsum. Dolor?") #=> ["Lorem", "ipsum", "Dolor"]
|
7
9
|
def self.words(text)
|
8
|
-
regexp = Regexp.new("[#{LexicalUnits
|
9
|
-
text.gsub(regexp,
|
10
|
+
regexp = Regexp.new("[#{LexicalUnits.separators}]")
|
11
|
+
text.gsub(regexp, ' ').split(' ')
|
10
12
|
end
|
11
13
|
|
12
14
|
private
|
15
|
+
|
13
16
|
def self.separators
|
14
17
|
[
|
15
|
-
'\,', '\:', '\;',
|
16
|
-
'
|
17
|
-
'\/',
|
18
|
-
'\(', '\)',
|
19
|
-
'\[', '\]',
|
20
|
-
'\>', '\<',
|
21
|
-
'\{', '\}',
|
18
|
+
'\,', '\:', '\;', '\.', '\?', '\!', '\/',
|
19
|
+
'\(', '\)', '\[', '\]', '\>', '\<', '\{', '\}',
|
22
20
|
'\|', '\~',
|
23
21
|
"\¿", "\¡",
|
24
|
-
'\=', '\"'
|
22
|
+
'\=', '\"',
|
23
|
+
"\»", "\«",
|
24
|
+
'\@', '\#',
|
25
|
+
'\+'
|
25
26
|
].join
|
26
27
|
end
|
27
28
|
end
|
@@ -1,16 +1,21 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
3
|
+
# Part of split into words, but excluding digits
|
2
4
|
module LexicalUnits
|
3
5
|
# Split text into words without digits
|
4
6
|
#
|
5
7
|
# self.words("Lorem 0 ipsum") #=> ["Lorem", "ipsum"]
|
6
8
|
# self.words("Lorem ipsum 100") #=> ["Lorem", "ipsum"]
|
7
9
|
def self.words_without_digits(text)
|
8
|
-
LexicalUnits
|
10
|
+
LexicalUnits.words(text).delete_if { |word| numeric?(word) }
|
9
11
|
end
|
10
12
|
|
11
13
|
private
|
14
|
+
|
12
15
|
def self.numeric?(value)
|
13
16
|
return true if value =~ /^\d+$/
|
14
|
-
true if Float(value)
|
17
|
+
true if Float(value)
|
18
|
+
rescue
|
19
|
+
false
|
15
20
|
end
|
16
21
|
end
|
data/lib/lexical_units.rb
CHANGED
@@ -1,9 +1,6 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
|
8
|
-
module LexicalUnits
|
9
|
-
end
|
1
|
+
require 'lexical_units/words'
|
2
|
+
require 'lexical_units/sentences'
|
3
|
+
require 'lexical_units/syllables'
|
4
|
+
require 'lexical_units/words_without_digits'
|
5
|
+
require 'lexical_units/string'
|
6
|
+
require 'lexical_units/version'
|
@@ -2,48 +2,49 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe LexicalUnits do
|
5
|
-
context
|
6
|
-
it
|
7
|
-
text = %q{Lorem ipsum dolor sit amet
|
5
|
+
context '.sentences' do
|
6
|
+
it 'splits text into sentences' do
|
7
|
+
text = %q{Lorem ipsum dolor sit amet. Consectetur adipiscing elit.
|
8
8
|
Fusce ut lacinia lorem. Nullam a sem quam. Duis faucibus tortor in.}
|
9
9
|
array = [
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
'Lorem ipsum dolor sit amet.',
|
11
|
+
'Consectetur adipiscing elit.',
|
12
|
+
'Fusce ut lacinia lorem.',
|
13
|
+
'Nullam a sem quam.',
|
14
|
+
'Duis faucibus tortor in.'
|
14
15
|
]
|
15
16
|
|
16
17
|
subject.sentences(text).should eq(array)
|
17
18
|
end
|
18
19
|
|
19
|
-
it
|
20
|
-
text =
|
20
|
+
it 'splits text with question mark and exclamation mark into sentences' do
|
21
|
+
text = 'Lorem ipsum dolor! Sit amet? Consectetur adipiscing elit.'
|
21
22
|
array = [
|
22
|
-
|
23
|
-
|
24
|
-
|
23
|
+
'Lorem ipsum dolor!',
|
24
|
+
'Sit amet?',
|
25
|
+
'Consectetur adipiscing elit.'
|
25
26
|
]
|
26
27
|
|
27
28
|
subject.sentences(text).should eq(array)
|
28
29
|
end
|
29
30
|
|
30
|
-
it
|
31
|
-
text =
|
31
|
+
it 'splits text with ellipsis into sentences' do
|
32
|
+
text = 'Lorem ipsum dolor, sit amet... Consectetur adipiscing elit.'
|
32
33
|
array = [
|
33
|
-
|
34
|
-
|
34
|
+
'Lorem ipsum dolor, sit amet...',
|
35
|
+
'Consectetur adipiscing elit.'
|
35
36
|
]
|
36
37
|
|
37
38
|
subject.sentences(text).should eq(array)
|
38
39
|
end
|
39
40
|
|
40
|
-
it
|
41
|
+
it 'splits text with interrobangs into sentences' do
|
41
42
|
text = "Say what‽ She's pregnant‽ Who is the father‽‽‽ Really?"
|
42
43
|
array = [
|
43
|
-
|
44
|
+
'Say what‽',
|
44
45
|
"She's pregnant‽",
|
45
|
-
|
46
|
-
|
46
|
+
'Who is the father‽‽‽',
|
47
|
+
'Really?'
|
47
48
|
]
|
48
49
|
|
49
50
|
subject.sentences(text).should eq(array)
|
@@ -2,12 +2,13 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe LexicalUnits::String do
|
5
|
+
# Testing class
|
5
6
|
class String
|
6
7
|
include LexicalUnits::String
|
7
8
|
end
|
8
9
|
|
9
|
-
context
|
10
|
-
it
|
10
|
+
context '#words' do
|
11
|
+
it 'splits String into words' do
|
11
12
|
array = %w(Lorem ipsum dolor sit amet)
|
12
13
|
string = array.join(' ')
|
13
14
|
|
@@ -15,12 +16,21 @@ describe LexicalUnits::String do
|
|
15
16
|
end
|
16
17
|
end
|
17
18
|
|
18
|
-
context
|
19
|
-
it
|
20
|
-
array = [
|
19
|
+
context '#sentences' do
|
20
|
+
it 'splits String into sentences' do
|
21
|
+
array = ['Lorem ipsum!', 'Dolor sit?', 'Amet.']
|
21
22
|
string = array.join
|
22
23
|
|
23
24
|
string.sentences.should eq(array)
|
24
25
|
end
|
25
26
|
end
|
27
|
+
|
28
|
+
context '#words_without_digits' do
|
29
|
+
it 'splits String into words (no ditigs)' do
|
30
|
+
array = %w(Lorem ipsum dolor sit amet)
|
31
|
+
string = 'Lorem 1 ipsum 23 dolor 456 sit 7890 amet'
|
32
|
+
|
33
|
+
string.words_without_digits.should eq(array)
|
34
|
+
end
|
35
|
+
end
|
26
36
|
end
|
@@ -2,102 +2,126 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe LexicalUnits do
|
5
|
-
context
|
5
|
+
context '.words' do
|
6
6
|
before do
|
7
7
|
@array = %w(Lorem ipsum dolor sit amet)
|
8
8
|
end
|
9
9
|
|
10
|
-
it
|
11
|
-
text =
|
10
|
+
it 'splits text with whitespaces into words' do
|
11
|
+
text = 'Lorem ipsum dolor sit amet'
|
12
12
|
|
13
13
|
subject.words(text).should eq(@array)
|
14
14
|
end
|
15
15
|
|
16
|
-
it
|
17
|
-
text =
|
16
|
+
it 'splits text with comma, colon and semicolon into words' do
|
17
|
+
text = 'Lorem ipsum,dolor:sit;amet'
|
18
18
|
|
19
19
|
subject.words(text).should eq(@array)
|
20
20
|
end
|
21
21
|
|
22
|
-
it
|
23
|
-
text =
|
22
|
+
it 'splits text with dot, question mark and exclamation mark into words' do
|
23
|
+
text = 'Lorem ipsum.dolor?sit!amet'
|
24
24
|
|
25
25
|
subject.words(text).should eq(@array)
|
26
26
|
end
|
27
27
|
|
28
|
-
it
|
29
|
-
text =
|
28
|
+
it 'splits other text with whitespaces, comma and dot into words' do
|
29
|
+
text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
|
30
30
|
array = %w(Lorem ipsum dolor sit amet consectetur adipiscing elit)
|
31
31
|
|
32
32
|
subject.words(text).should eq(array)
|
33
33
|
end
|
34
34
|
|
35
|
-
it
|
36
|
-
text =
|
35
|
+
it 'no splits text with hyphen into words' do
|
36
|
+
text = 'Lorem ipsum dolor-sit amet'
|
37
37
|
array = %w(Lorem ipsum dolor-sit amet)
|
38
38
|
|
39
39
|
subject.words(text).should eq(array)
|
40
40
|
end
|
41
41
|
|
42
|
-
it
|
43
|
-
text =
|
42
|
+
it 'splits text with slash into words' do
|
43
|
+
text = 'Lorem ipsum dolor sit/amet'
|
44
44
|
|
45
45
|
subject.words(text).should eq(@array)
|
46
46
|
end
|
47
47
|
|
48
|
-
it
|
49
|
-
text =
|
48
|
+
it 'splits text with round brackets into words' do
|
49
|
+
text = 'Lorem ipsum(dolor sit)amet'
|
50
50
|
|
51
51
|
subject.words(text).should eq(@array)
|
52
52
|
end
|
53
53
|
|
54
|
-
it
|
55
|
-
text =
|
54
|
+
it 'splits text with square brackets into words' do
|
55
|
+
text = 'Lorem ipsum dolor[sit]amet'
|
56
56
|
|
57
57
|
subject.words(text).should eq(@array)
|
58
58
|
end
|
59
59
|
|
60
|
-
it
|
61
|
-
text =
|
60
|
+
it 'splits text with pointy brackets into words' do
|
61
|
+
text = 'Lorem<ipsum dolor sit>amet'
|
62
62
|
|
63
63
|
subject.words(text).should eq(@array)
|
64
64
|
end
|
65
65
|
|
66
|
-
it
|
67
|
-
text =
|
66
|
+
it 'splits text with braces into words' do
|
67
|
+
text = 'Lorem ipsum{dolor}sit amet'
|
68
68
|
|
69
69
|
subject.words(text).should eq(@array)
|
70
70
|
end
|
71
71
|
|
72
|
-
it
|
73
|
-
text =
|
72
|
+
it 'splits text with vertical bar into words' do
|
73
|
+
text = 'Lorem ipsum|dolor sit amet'
|
74
74
|
|
75
75
|
subject.words(text).should eq(@array)
|
76
76
|
end
|
77
77
|
|
78
|
-
it
|
79
|
-
text =
|
78
|
+
it 'splits text with tilde into words' do
|
79
|
+
text = 'Lorem ipsum dolor~sit amet'
|
80
80
|
|
81
81
|
subject.words(text).should eq(@array)
|
82
82
|
end
|
83
83
|
|
84
84
|
# Spanish
|
85
|
-
it
|
86
|
-
text =
|
85
|
+
it 'splits text with inverted question and exclamation marks into words' do
|
86
|
+
text = 'Lorem¿ipsum?dolor¡sit!amet'
|
87
87
|
|
88
88
|
subject.words(text).should eq(@array)
|
89
89
|
end
|
90
90
|
|
91
|
-
it
|
92
|
-
text =
|
91
|
+
it 'splits text with equals sign into words' do
|
92
|
+
text = 'Lorem ipsum=dolor sit amet'
|
93
93
|
|
94
94
|
subject.words(text).should eq(@array)
|
95
95
|
end
|
96
96
|
|
97
|
-
it
|
97
|
+
it 'splits text with typewriter double quotes into words' do
|
98
98
|
text = %Q(Lorem"ipsum dolor"sit amet)
|
99
99
|
|
100
100
|
subject.words(text).should eq(@array)
|
101
101
|
end
|
102
|
+
|
103
|
+
it 'split text with non-English quotation marks into words' do
|
104
|
+
text = %Q(Lorem»ipsum dolor«sit amet)
|
105
|
+
|
106
|
+
subject.words(text).should eq(@array)
|
107
|
+
end
|
108
|
+
|
109
|
+
it "split text with 'at sign' (@) into words" do
|
110
|
+
text = %Q(Lorem@ipsum dolor sit amet)
|
111
|
+
|
112
|
+
subject.words(text).should eq(@array)
|
113
|
+
end
|
114
|
+
|
115
|
+
it "split text with 'number sign (#) into words" do
|
116
|
+
text = %Q(Lorem ipsum#dolor sit amet)
|
117
|
+
|
118
|
+
subject.words(text).should eq(@array)
|
119
|
+
end
|
120
|
+
|
121
|
+
it 'split text with plus (+) into words' do
|
122
|
+
text = %Q(Lorem+ipsum dolor+sit amet)
|
123
|
+
|
124
|
+
subject.words(text).should eq(@array)
|
125
|
+
end
|
102
126
|
end
|
103
127
|
end
|
@@ -2,13 +2,13 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe LexicalUnits do
|
5
|
-
context
|
5
|
+
context '.words_without_digits' do
|
6
6
|
[
|
7
|
-
{text:
|
8
|
-
{text:
|
7
|
+
{ text: 'Lorem ipsum 12345', array: %w(Lorem ipsum) },
|
8
|
+
{ text: 'dolor 98765 sit amet.', array: %w(dolor sit amet) }
|
9
9
|
].each do |hash|
|
10
10
|
text, array = hash.values
|
11
|
-
it
|
11
|
+
it 'splits text into words without digits' do
|
12
12
|
subject.words_without_digits(text).should eq(array)
|
13
13
|
end
|
14
14
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lexical_units
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksander Malaszkiewicz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -70,7 +70,7 @@ files:
|
|
70
70
|
- spec/lexical_units/words_spec.rb
|
71
71
|
- spec/lexical_units/words_without_digits_spec.rb
|
72
72
|
- spec/spec_helper.rb
|
73
|
-
homepage:
|
73
|
+
homepage: https://github.com/fractalsoft/lexical_units
|
74
74
|
licenses:
|
75
75
|
- MIT
|
76
76
|
metadata: {}
|