lexical_units 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -1
- data/lexical_units.gemspec +1 -1
- data/lib/lexical_units/sentences.rb +5 -2
- data/lib/lexical_units/string.rb +7 -2
- data/lib/lexical_units/syllables.rb +2 -0
- data/lib/lexical_units/version.rb +2 -1
- data/lib/lexical_units/words.rb +12 -11
- data/lib/lexical_units/words_without_digits.rb +7 -2
- data/lib/lexical_units.rb +6 -9
- data/spec/lexical_units/sentences_spec.rb +21 -20
- data/spec/lexical_units/string_spec.rb +15 -5
- data/spec/lexical_units/syllables_spec.rb +1 -2
- data/spec/lexical_units/words_spec.rb +54 -30
- data/spec/lexical_units/words_without_digits_spec.rb +4 -4
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fedd5fd50283e6f7f2fb2881a8c46326cdeffb7b
|
4
|
+
data.tar.gz: ec83c68109e1568fdd8b7b1b2e4628ea28038b2c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b92570fd96372f27851e2dd3c9d771d534ae99f5a49847ba9289dc61f6c716a446ff3148d33bb409c0a38a6953856ee9f71683648d12c4d1da56fd5960de44a1
|
7
|
+
data.tar.gz: f749de922a2cd228733abc2de4690ffd9853e49bbba5533fe46625f41230147e0c78d5db11155b2b40be33287d4a854d597bc34e10747c5105554d337271cacc
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
# LexicalUnits [![Gem Version](https://badge.fury.io/rb/lexical_units.png)](http://badge.fury.io/rb/lexical_units) [![Build Status](https://travis-ci.org/fractalsoft/lexical_units.png)](https://travis-ci.org/fractalsoft/lexical_units) [![Dependency Status](https://gemnasium.com/fractalsoft/lexical_units.png)](https://gemnasium.com/fractalsoft/lexical_units) [![Coverage Status](https://coveralls.io/repos/fractalsoft/lexical_units/badge.png)](https://coveralls.io/r/fractalsoft/lexical_units)
|
1
|
+
# LexicalUnits [![Gem Version](https://badge.fury.io/rb/lexical_units.png)](http://badge.fury.io/rb/lexical_units) [![Build Status](https://travis-ci.org/fractalsoft/lexical_units.png)](https://travis-ci.org/fractalsoft/lexical_units) [![Dependency Status](https://gemnasium.com/fractalsoft/lexical_units.png)](https://gemnasium.com/fractalsoft/lexical_units) [![Coverage Status](https://coveralls.io/repos/fractalsoft/lexical_units/badge.png)](https://coveralls.io/r/fractalsoft/lexical_units) [![Stories in Ready](https://badge.waffle.io/fractalsoft/lexical_units.png)](http://waffle.io/fractalsoft/lexical_units)
|
2
|
+
|
2
3
|
[![endorse](https://api.coderwall.com/torrocus/endorsecount.png)](https://coderwall.com/torrocus)
|
3
4
|
|
4
5
|
Lexical unit is a single word, a part of a word, or a chain of words that forms the basic elements of a language's lexicon.
|
@@ -22,6 +23,7 @@ Or install it yourself as:
|
|
22
23
|
```ruby
|
23
24
|
LexicalUnits::words(text)
|
24
25
|
LexicalUnits::sentences(text)
|
26
|
+
LexicalUnits::words_without_digits(text)
|
25
27
|
```
|
26
28
|
|
27
29
|
You can include methods into String class:
|
data/lexical_units.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Aleksander Malaszkiewicz"]
|
10
10
|
spec.email = ["info@fractalsoft.org"]
|
11
11
|
spec.summary = %q{Split text into lexical units}
|
12
|
-
spec.homepage = ""
|
12
|
+
spec.homepage = "https://github.com/fractalsoft/lexical_units"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
15
15
|
spec.files = `git ls-files`.split($/)
|
@@ -1,20 +1,23 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
3
|
+
# Part of split into sentences
|
2
4
|
module LexicalUnits
|
3
5
|
# Split text into sentences
|
4
6
|
#
|
5
7
|
# self.sentences("Lorem, ipsum. Dolor?") #=> ["Lorem, ipsum.", "Dolor?"]
|
6
8
|
# self.sentences("Lorem! Ipsum dolor?") #=> ["Lorem!", "Ipsum dolor?"]
|
7
9
|
def self.sentences(text)
|
8
|
-
separators = LexicalUnits
|
10
|
+
separators = LexicalUnits.sentence_separators
|
9
11
|
regexp = Regexp.new("[^#{separators}]+[#{separators}]{1,3}")
|
10
12
|
text.scan(regexp).map(&:strip)
|
11
13
|
end
|
12
14
|
|
13
15
|
private
|
16
|
+
|
14
17
|
def self.sentence_separators
|
15
18
|
[
|
16
19
|
'\.', '\?', '\!',
|
17
|
-
|
20
|
+
'‽'
|
18
21
|
].join
|
19
22
|
end
|
20
23
|
end
|
data/lib/lexical_units/string.rb
CHANGED
@@ -1,12 +1,17 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
module LexicalUnits
|
3
|
+
# Use lexical units inside String class
|
3
4
|
module String
|
4
5
|
def words
|
5
|
-
LexicalUnits
|
6
|
+
LexicalUnits.words(self)
|
6
7
|
end
|
7
8
|
|
8
9
|
def sentences
|
9
|
-
LexicalUnits
|
10
|
+
LexicalUnits.sentences(self)
|
11
|
+
end
|
12
|
+
|
13
|
+
def words_without_digits
|
14
|
+
LexicalUnits.words_without_digits(self)
|
10
15
|
end
|
11
16
|
end
|
12
17
|
end
|
data/lib/lexical_units/words.rb
CHANGED
@@ -1,27 +1,28 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
3
|
+
# Part of split into words
|
2
4
|
module LexicalUnits
|
3
5
|
# Split text into words
|
4
6
|
#
|
5
|
-
# self.words("Lorem ipsum dolor sit") #=> ["Lorem",
|
7
|
+
# self.words("Lorem ipsum dolor sit") #=> ["Lorem", "ipsum", "dolor", "sit"]
|
6
8
|
# self.words("Lorem, ipsum. Dolor?") #=> ["Lorem", "ipsum", "Dolor"]
|
7
9
|
def self.words(text)
|
8
|
-
regexp = Regexp.new("[#{LexicalUnits
|
9
|
-
text.gsub(regexp,
|
10
|
+
regexp = Regexp.new("[#{LexicalUnits.separators}]")
|
11
|
+
text.gsub(regexp, ' ').split(' ')
|
10
12
|
end
|
11
13
|
|
12
14
|
private
|
15
|
+
|
13
16
|
def self.separators
|
14
17
|
[
|
15
|
-
'\,', '\:', '\;',
|
16
|
-
'
|
17
|
-
'\/',
|
18
|
-
'\(', '\)',
|
19
|
-
'\[', '\]',
|
20
|
-
'\>', '\<',
|
21
|
-
'\{', '\}',
|
18
|
+
'\,', '\:', '\;', '\.', '\?', '\!', '\/',
|
19
|
+
'\(', '\)', '\[', '\]', '\>', '\<', '\{', '\}',
|
22
20
|
'\|', '\~',
|
23
21
|
"\¿", "\¡",
|
24
|
-
'\=', '\"'
|
22
|
+
'\=', '\"',
|
23
|
+
"\»", "\«",
|
24
|
+
'\@', '\#',
|
25
|
+
'\+'
|
25
26
|
].join
|
26
27
|
end
|
27
28
|
end
|
@@ -1,16 +1,21 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
3
|
+
# Part of split into words, but excluding digits
|
2
4
|
module LexicalUnits
|
3
5
|
# Split text into words without digits
|
4
6
|
#
|
5
7
|
# self.words_without_digits("Lorem 0 ipsum") #=> ["Lorem", "ipsum"]
|
6
8
|
# self.words_without_digits("Lorem ipsum 100") #=> ["Lorem", "ipsum"]
|
7
9
|
def self.words_without_digits(text)
|
8
|
-
LexicalUnits
|
10
|
+
LexicalUnits.words(text).delete_if { |word| numeric?(word) }
|
9
11
|
end
|
10
12
|
|
11
13
|
private
|
14
|
+
|
12
15
|
def self.numeric?(value)
|
13
16
|
return true if value =~ /^\d+$/
|
14
|
-
true if Float(value)
|
17
|
+
true if Float(value)
|
18
|
+
rescue
|
19
|
+
false
|
15
20
|
end
|
16
21
|
end
|
data/lib/lexical_units.rb
CHANGED
@@ -1,9 +1,6 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
|
8
|
-
module LexicalUnits
|
9
|
-
end
|
1
|
+
require 'lexical_units/words'
|
2
|
+
require 'lexical_units/sentences'
|
3
|
+
require 'lexical_units/syllables'
|
4
|
+
require 'lexical_units/words_without_digits'
|
5
|
+
require 'lexical_units/string'
|
6
|
+
require 'lexical_units/version'
|
@@ -2,48 +2,49 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe LexicalUnits do
|
5
|
-
context
|
6
|
-
it
|
7
|
-
text = %q{Lorem ipsum dolor sit amet
|
5
|
+
context '.sentences' do
|
6
|
+
it 'splits text into sentences' do
|
7
|
+
text = %q{Lorem ipsum dolor sit amet. Consectetur adipiscing elit.
|
8
8
|
Fusce ut lacinia lorem. Nullam a sem quam. Duis faucibus tortor in.}
|
9
9
|
array = [
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
'Lorem ipsum dolor sit amet.',
|
11
|
+
'Consectetur adipiscing elit.',
|
12
|
+
'Fusce ut lacinia lorem.',
|
13
|
+
'Nullam a sem quam.',
|
14
|
+
'Duis faucibus tortor in.'
|
14
15
|
]
|
15
16
|
|
16
17
|
subject.sentences(text).should eq(array)
|
17
18
|
end
|
18
19
|
|
19
|
-
it
|
20
|
-
text =
|
20
|
+
it 'splits text with question mark and exclamation mark into sentences' do
|
21
|
+
text = 'Lorem ipsum dolor! Sit amet? Consectetur adipiscing elit.'
|
21
22
|
array = [
|
22
|
-
|
23
|
-
|
24
|
-
|
23
|
+
'Lorem ipsum dolor!',
|
24
|
+
'Sit amet?',
|
25
|
+
'Consectetur adipiscing elit.'
|
25
26
|
]
|
26
27
|
|
27
28
|
subject.sentences(text).should eq(array)
|
28
29
|
end
|
29
30
|
|
30
|
-
it
|
31
|
-
text =
|
31
|
+
it 'splits text with ellipsis into sentences' do
|
32
|
+
text = 'Lorem ipsum dolor, sit amet... Consectetur adipiscing elit.'
|
32
33
|
array = [
|
33
|
-
|
34
|
-
|
34
|
+
'Lorem ipsum dolor, sit amet...',
|
35
|
+
'Consectetur adipiscing elit.'
|
35
36
|
]
|
36
37
|
|
37
38
|
subject.sentences(text).should eq(array)
|
38
39
|
end
|
39
40
|
|
40
|
-
it
|
41
|
+
it 'splits text with interrobangs into sentences' do
|
41
42
|
text = "Say what‽ She's pregnant‽ Who is the father‽‽‽ Really?"
|
42
43
|
array = [
|
43
|
-
|
44
|
+
'Say what‽',
|
44
45
|
"She's pregnant‽",
|
45
|
-
|
46
|
-
|
46
|
+
'Who is the father‽‽‽',
|
47
|
+
'Really?'
|
47
48
|
]
|
48
49
|
|
49
50
|
subject.sentences(text).should eq(array)
|
@@ -2,12 +2,13 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe LexicalUnits::String do
|
5
|
+
# Testing class
|
5
6
|
class String
|
6
7
|
include LexicalUnits::String
|
7
8
|
end
|
8
9
|
|
9
|
-
context
|
10
|
-
it
|
10
|
+
context '#words' do
|
11
|
+
it 'splits String into words' do
|
11
12
|
array = %w(Lorem ipsum dolor sit amet)
|
12
13
|
string = array.join(' ')
|
13
14
|
|
@@ -15,12 +16,21 @@ describe LexicalUnits::String do
|
|
15
16
|
end
|
16
17
|
end
|
17
18
|
|
18
|
-
context
|
19
|
-
it
|
20
|
-
array = [
|
19
|
+
context '#sentences' do
|
20
|
+
it 'splits String into sentences' do
|
21
|
+
array = ['Lorem ipsum!', 'Dolor sit?', 'Amet.']
|
21
22
|
string = array.join
|
22
23
|
|
23
24
|
string.sentences.should eq(array)
|
24
25
|
end
|
25
26
|
end
|
27
|
+
|
28
|
+
context '#words_without_digits' do
|
29
|
+
it 'splits String into words (no ditigs)' do
|
30
|
+
array = %w(Lorem ipsum dolor sit amet)
|
31
|
+
string = 'Lorem 1 ipsum 23 dolor 456 sit 7890 amet'
|
32
|
+
|
33
|
+
string.words_without_digits.should eq(array)
|
34
|
+
end
|
35
|
+
end
|
26
36
|
end
|
@@ -2,102 +2,126 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe LexicalUnits do
|
5
|
-
context
|
5
|
+
context '.words' do
|
6
6
|
before do
|
7
7
|
@array = %w(Lorem ipsum dolor sit amet)
|
8
8
|
end
|
9
9
|
|
10
|
-
it
|
11
|
-
text =
|
10
|
+
it 'splits text with whitespaces into words' do
|
11
|
+
text = 'Lorem ipsum dolor sit amet'
|
12
12
|
|
13
13
|
subject.words(text).should eq(@array)
|
14
14
|
end
|
15
15
|
|
16
|
-
it
|
17
|
-
text =
|
16
|
+
it 'splits text with comma, colon and semicolon into words' do
|
17
|
+
text = 'Lorem ipsum,dolor:sit;amet'
|
18
18
|
|
19
19
|
subject.words(text).should eq(@array)
|
20
20
|
end
|
21
21
|
|
22
|
-
it
|
23
|
-
text =
|
22
|
+
it 'splits text with dot, question mark and exclamation mark into words' do
|
23
|
+
text = 'Lorem ipsum.dolor?sit!amet'
|
24
24
|
|
25
25
|
subject.words(text).should eq(@array)
|
26
26
|
end
|
27
27
|
|
28
|
-
it
|
29
|
-
text =
|
28
|
+
it 'splits other text with whitespaces, comma and dot into words' do
|
29
|
+
text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
|
30
30
|
array = %w(Lorem ipsum dolor sit amet consectetur adipiscing elit)
|
31
31
|
|
32
32
|
subject.words(text).should eq(array)
|
33
33
|
end
|
34
34
|
|
35
|
-
it
|
36
|
-
text =
|
35
|
+
it 'no splits text with hyphen into words' do
|
36
|
+
text = 'Lorem ipsum dolor-sit amet'
|
37
37
|
array = %w(Lorem ipsum dolor-sit amet)
|
38
38
|
|
39
39
|
subject.words(text).should eq(array)
|
40
40
|
end
|
41
41
|
|
42
|
-
it
|
43
|
-
text =
|
42
|
+
it 'splits text with slash into words' do
|
43
|
+
text = 'Lorem ipsum dolor sit/amet'
|
44
44
|
|
45
45
|
subject.words(text).should eq(@array)
|
46
46
|
end
|
47
47
|
|
48
|
-
it
|
49
|
-
text =
|
48
|
+
it 'splits text with round brackets into words' do
|
49
|
+
text = 'Lorem ipsum(dolor sit)amet'
|
50
50
|
|
51
51
|
subject.words(text).should eq(@array)
|
52
52
|
end
|
53
53
|
|
54
|
-
it
|
55
|
-
text =
|
54
|
+
it 'splits text with square brackets into words' do
|
55
|
+
text = 'Lorem ipsum dolor[sit]amet'
|
56
56
|
|
57
57
|
subject.words(text).should eq(@array)
|
58
58
|
end
|
59
59
|
|
60
|
-
it
|
61
|
-
text =
|
60
|
+
it 'splits text with pointy brackets into words' do
|
61
|
+
text = 'Lorem<ipsum dolor sit>amet'
|
62
62
|
|
63
63
|
subject.words(text).should eq(@array)
|
64
64
|
end
|
65
65
|
|
66
|
-
it
|
67
|
-
text =
|
66
|
+
it 'splits text with braces into words' do
|
67
|
+
text = 'Lorem ipsum{dolor}sit amet'
|
68
68
|
|
69
69
|
subject.words(text).should eq(@array)
|
70
70
|
end
|
71
71
|
|
72
|
-
it
|
73
|
-
text =
|
72
|
+
it 'splits text with vertical bar into words' do
|
73
|
+
text = 'Lorem ipsum|dolor sit amet'
|
74
74
|
|
75
75
|
subject.words(text).should eq(@array)
|
76
76
|
end
|
77
77
|
|
78
|
-
it
|
79
|
-
text =
|
78
|
+
it 'splits text with tilde into words' do
|
79
|
+
text = 'Lorem ipsum dolor~sit amet'
|
80
80
|
|
81
81
|
subject.words(text).should eq(@array)
|
82
82
|
end
|
83
83
|
|
84
84
|
# Spanish
|
85
|
-
it
|
86
|
-
text =
|
85
|
+
it 'splits text with inverted question and exclamation marks into words' do
|
86
|
+
text = 'Lorem¿ipsum?dolor¡sit!amet'
|
87
87
|
|
88
88
|
subject.words(text).should eq(@array)
|
89
89
|
end
|
90
90
|
|
91
|
-
it
|
92
|
-
text =
|
91
|
+
it 'splits text with equals sign into words' do
|
92
|
+
text = 'Lorem ipsum=dolor sit amet'
|
93
93
|
|
94
94
|
subject.words(text).should eq(@array)
|
95
95
|
end
|
96
96
|
|
97
|
-
it
|
97
|
+
it 'splits text with typewriter double quotes into words' do
|
98
98
|
text = %Q(Lorem"ipsum dolor"sit amet)
|
99
99
|
|
100
100
|
subject.words(text).should eq(@array)
|
101
101
|
end
|
102
|
+
|
103
|
+
it 'split text with non-English quotation marks into words' do
|
104
|
+
text = %Q(Lorem»ipsum dolor«sit amet)
|
105
|
+
|
106
|
+
subject.words(text).should eq(@array)
|
107
|
+
end
|
108
|
+
|
109
|
+
it "split text with 'at sign' (@) into words" do
|
110
|
+
text = %Q(Lorem@ipsum dolor sit amet)
|
111
|
+
|
112
|
+
subject.words(text).should eq(@array)
|
113
|
+
end
|
114
|
+
|
115
|
+
it "split text with 'number sign (#) into words" do
|
116
|
+
text = %Q(Lorem ipsum#dolor sit amet)
|
117
|
+
|
118
|
+
subject.words(text).should eq(@array)
|
119
|
+
end
|
120
|
+
|
121
|
+
it 'split text with plus (+) into words' do
|
122
|
+
text = %Q(Lorem+ipsum dolor+sit amet)
|
123
|
+
|
124
|
+
subject.words(text).should eq(@array)
|
125
|
+
end
|
102
126
|
end
|
103
127
|
end
|
@@ -2,13 +2,13 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe LexicalUnits do
|
5
|
-
context
|
5
|
+
context '.words_without_digits' do
|
6
6
|
[
|
7
|
-
{text:
|
8
|
-
{text:
|
7
|
+
{ text: 'Lorem ipsum 12345', array: %w(Lorem ipsum) },
|
8
|
+
{ text: 'dolor 98765 sit amet.', array: %w(dolor sit amet) }
|
9
9
|
].each do |hash|
|
10
10
|
text, array = hash.values
|
11
|
-
it
|
11
|
+
it 'splits text into words without digits' do
|
12
12
|
subject.words_without_digits(text).should eq(array)
|
13
13
|
end
|
14
14
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lexical_units
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksander Malaszkiewicz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -70,7 +70,7 @@ files:
|
|
70
70
|
- spec/lexical_units/words_spec.rb
|
71
71
|
- spec/lexical_units/words_without_digits_spec.rb
|
72
72
|
- spec/spec_helper.rb
|
73
|
-
homepage:
|
73
|
+
homepage: https://github.com/fractalsoft/lexical_units
|
74
74
|
licenses:
|
75
75
|
- MIT
|
76
76
|
metadata: {}
|