rb_lib_text 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/rb_lib_text.rb +6 -5
- data/lib/rb_lib_text/version.rb +1 -1
- metadata +14 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 31fba2668d0fe67e413f836abdc9a9d25047463b
|
4
|
+
data.tar.gz: 42475fec2a1e786f11f4fa816e4ecb547ef7b513
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a1087345a482f63dad742161c0aa6f3717de82a9e7d77082b112c15a574bd95cfea13c0a111722d497e2114fd4ff4875022e1353f7cd7743433d426ad7b0566f
|
7
|
+
data.tar.gz: da9b9e08ebab7b744c63b897e0fa42bd0503b3c15ab12d768b33f3285a8a9448d475fdf79cdde22f5b571045a21cee801e13f7658f3ff70232bed64e7576cd14
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# RbLibText
|
2
2
|
A little text processing library for Ruby.
|
3
3
|
|
4
|
-
[![Build Status](https://travis-ci.
|
4
|
+
[![Build Status](https://travis-ci.org/peoplepattern/rb-lib-text.svg)](https://travis-ci.org/peoplepattern/rb-lib-text)
|
5
5
|
|
6
6
|
## Overview
|
7
7
|
The tokenization has been tuned to work well with text conventions commonly used in social media such as Twitter, and supports URLs, hashtags, emails and @-mentions cleanly.
|
data/lib/rb_lib_text.rb
CHANGED
@@ -14,23 +14,24 @@ module RbLibText
|
|
14
14
|
tags_contractions: '[\w]+[\'‘’][\w]+', #don't split don't and can't and it's
|
15
15
|
emails: '[\w\.\d]+@[\w\.\d]+\.[\w]+', #catch email addresses
|
16
16
|
urls: 'https?://[-_/~%\w\d\.]*[_/~\w\d]', #Catch url addresses
|
17
|
-
#sideways_text_emoji: '>?[:;=][\'\-D\)\]\(\[pPdoO/\*3\\]+',
|
17
|
+
# sideways_text_emoji: '>?[:;=][\'\-D\)\]\(\[pPdoO/\*3\\]+',
|
18
|
+
sideways_text_emoji: '>?[:;=8][\'\-D\)\(3DdPpOo\*\/]+',
|
18
19
|
ellipses: '\.{3}',
|
19
20
|
en_em_dash: '-{2,3}', #Catch en and em dashes
|
20
21
|
slashes: '[\w]+(?:[/\-][\w]+)+', #Grammatical / -
|
21
22
|
punct: '[\"“”‘’\'\\.\\?!…,:;»«\(\)]', #punctuation to split on
|
22
23
|
tags_mentions: '[\w#@\d%$\u00B0]+', #Group all of these things together
|
24
|
+
hearts: '<+\/?3', # <3
|
23
25
|
emoji_block0: '[\U00002600-\U000027BF]',
|
24
26
|
emoji_block1: '[\U0001f300-\U0001f64F]',
|
25
27
|
emoji_block2: '[\U0001f680-\U0001f6FF]',
|
26
|
-
|
27
|
-
other_punct: '[\u2014\u2013]',
|
28
|
+
other_punct: '[\u2014\u2013]',
|
28
29
|
all_other: '[^\s]', #Split any other weird chars that may have been missed
|
29
30
|
}
|
30
|
-
|
31
|
+
|
31
32
|
return Regexp.union(patterns.values.map{|value| Regexp.new(value)})
|
32
33
|
end
|
33
|
-
|
34
|
+
|
34
35
|
def self.tokens(text)
|
35
36
|
text = text.gsub("\u2026", "...")
|
36
37
|
text = text.gsub(/\.{2,}/, "...")
|
data/lib/rb_lib_text/version.rb
CHANGED
metadata
CHANGED
@@ -1,55 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rb_lib_text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- johnnytomcat
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - '>='
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: 1.8.4
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 1.8.4
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ~>
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '10.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ~>
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - '>='
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - '>='
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
description: The tokenization has been tuned to work well with text conventions commonly
|
@@ -61,9 +61,9 @@ executables: []
|
|
61
61
|
extensions: []
|
62
62
|
extra_rdoc_files: []
|
63
63
|
files:
|
64
|
-
-
|
65
|
-
-
|
66
|
-
-
|
64
|
+
- .gitignore
|
65
|
+
- .rspec
|
66
|
+
- .travis.yml
|
67
67
|
- CODE_OF_CONDUCT.md
|
68
68
|
- Gemfile
|
69
69
|
- LICENSE.txt
|
@@ -85,17 +85,17 @@ require_paths:
|
|
85
85
|
- lib
|
86
86
|
required_ruby_version: !ruby/object:Gem::Requirement
|
87
87
|
requirements:
|
88
|
-
- -
|
88
|
+
- - '>='
|
89
89
|
- !ruby/object:Gem::Version
|
90
90
|
version: '0'
|
91
91
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
92
|
requirements:
|
93
|
-
- -
|
93
|
+
- - '>='
|
94
94
|
- !ruby/object:Gem::Version
|
95
95
|
version: '0'
|
96
96
|
requirements: []
|
97
97
|
rubyforge_project:
|
98
|
-
rubygems_version: 2.
|
98
|
+
rubygems_version: 2.0.14
|
99
99
|
signing_key:
|
100
100
|
specification_version: 4
|
101
101
|
summary: A little text processing library for Ruby.
|