ruby_nlp 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +88 -0
- data/lib/ruby_nlp/corpus.rb +38 -0
- data/lib/ruby_nlp/corpus_files/brown.rb +19 -0
- data/lib/ruby_nlp/ngram.rb +24 -0
- metadata +63 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: be92ab0a6b600f0f21aa400c53ee1cf110e41f6d
|
4
|
+
data.tar.gz: d661fcb65a06a5772b9805701d6f830997604a49
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a0ebce070e03ffe77a815329f4a36cfb456d2ef640c8c08a8803331a71cf2e4c2d1b4df6c9cbc80047085b410d587b40326a84a956b2b26d704a2ad7a4e7834c
|
7
|
+
data.tar.gz: 4913d4f7711ff42eb7d58ddd5ee340c1f35baaeb768c5f1857765f3dbba9b6da9c98e60361ddb5f06e3576ac09e4cf754cdcaf3e63953ed6f6663a3cc8f3baec
|
data/README.md
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# Ruby NLP
|
2
|
+
|
3
|
+
This repository is a pure Ruby toolkit for NLP. It contains the example code for an ongoing series I'm writing on SitePoint.com, so watch this repository for updates!
|
4
|
+
|
5
|
+
## Installing
|
6
|
+
|
7
|
+
Simply install the gem:
|
8
|
+
|
9
|
+
```sh
|
10
|
+
gem install ruby_nlp
|
11
|
+
```
|
12
|
+
|
13
|
+
## What's In This Project?
|
14
|
+
|
15
|
+
### *n*-grams
|
16
|
+
|
17
|
+
A simple class to generate *n*-grams from an input string, either per word, per character, or on some custom splitter.
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require 'ruby_nlp/ngram'
|
21
|
+
ngram = Ngram.new("foo bar lux win")
|
22
|
+
|
23
|
+
ngram.unigrams # => [["foo"], ["bar"], ["lux"], ["win"]]
|
24
|
+
ngram.bigrams # => [["foo", "bar"], ["bar", "lux"], ["lux", "win"]]
|
25
|
+
ngram.trigrams # => [["foo", "bar", "lux"], ["bar", "lux", "win"]]
|
26
|
+
ngram.ngrams(4) # => [["foo", "bar", "lux", "win"]]
|
27
|
+
|
28
|
+
ngram = Ngram.new("abcde", regex: //)
|
29
|
+
ngram.unigrams # => [["a"], ["b"], ["c"], ["d"], ["e"]]
|
30
|
+
```
|
31
|
+
|
32
|
+
### Corpus Extractor
|
33
|
+
|
34
|
+
There is a basic corpus loader. It takes a glob for the files that are part of the corpus, and a class to delegate the sentence extraction to.
|
35
|
+
|
36
|
+
Currently, support for the following corpus types is included:
|
37
|
+
|
38
|
+
* Brown Corpus, in the `BrownCorpusFile` class at `ruby_nlp/corpus_files/brown`. This corpus currently strips the tagging to reveal only the raw sentences in the corpus.
|
39
|
+
|
40
|
+
An example of this in action:
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
require 'ruby_nlp/corpus'
|
44
|
+
require 'ruby_nlp/corpus_files/brown'
|
45
|
+
|
46
|
+
corpus = Corpus.new('brown/c*', BrownCorpusFile)
|
47
|
+
corpus.trigrams # => One *really* big array of trigrams!
|
48
|
+
|
49
|
+
corpus.files # => `Array` of `BrownCorpusFile` instances, one for each file found.
|
50
|
+
corpus.sentences # => Flattened `Array` of sentences.
|
51
|
+
```
|
52
|
+
|
53
|
+
## Examples
|
54
|
+
|
55
|
+
Some examples are in the `examples` directory, one per article from SitePoint.com in this series.
|
56
|
+
|
57
|
+
## Future Work
|
58
|
+
|
59
|
+
* Support for other corpora.
|
60
|
+
* Preservation, filtering and searching of tags in corpora.
|
61
|
+
* Markov modelling and chaining.
|
62
|
+
* Text generation using Markov modelling.
|
63
|
+
* Part of speech tagging.
|
64
|
+
* Classification.
|
65
|
+
|
66
|
+
## License
|
67
|
+
|
68
|
+
The MIT License (MIT)
|
69
|
+
|
70
|
+
Copyright (c) 2013 Nathan Kleyn
|
71
|
+
|
72
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
73
|
+
of this software and associated documentation files (the "Software"), to deal
|
74
|
+
in the Software without restriction, including without limitation the rights
|
75
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
76
|
+
copies of the Software, and to permit persons to whom the Software is
|
77
|
+
furnished to do so, subject to the following conditions:
|
78
|
+
|
79
|
+
The above copyright notice and this permission notice shall be included in
|
80
|
+
all copies or substantial portions of the Software.
|
81
|
+
|
82
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
83
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
84
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
85
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
86
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
87
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
88
|
+
THE SOFTWARE.
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'ruby_nlp/ngram'
|
2
|
+
|
3
|
+
# Loads a corpus that is spread across several files and exposes its
# sentences and n-grams.
#
# Files are located with a glob. Each matching path is wrapped in an instance
# of the supplied class, which does the actual sentence extraction.
class Corpus
  # @param glob [String] pattern passed to +Dir[]+ to locate the corpus files.
  # @param klass [Class] instantiated with each matching path; its instances
  #   must respond to +sentences+.
  def initialize(glob, klass)
    @glob = glob
    @klass = klass
  end

  # One +klass+ instance per file matched by the glob (memoized).
  #
  # Paths are sorted explicitly: before Ruby 3.0 the order returned by
  # +Dir[]+ depends on the filesystem, which would make the order of
  # sentences and n-grams vary between machines.
  #
  # @return [Array] the wrapped corpus files, ordered by path.
  def files
    @files ||= Dir[@glob].sort.map { |path| @klass.new(path) }
  end

  # @return [Array] the sentences of every file, flattened into one array.
  def sentences
    files.map(&:sentences).flatten
  end

  # Builds the n-grams of every sentence. N-grams never span two sentences
  # because each sentence is tokenised on its own.
  #
  # @param n [Integer] number of tokens per n-gram.
  # @return [Array<Array<String>>] all n-grams, in sentence order.
  def ngrams(n)
    sentences.flat_map { |sentence| Ngram.new(sentence).ngrams(n) }
  end

  # @return [Array<Array<String>>] n-grams of size 1.
  def unigrams
    ngrams(1)
  end

  # @return [Array<Array<String>>] n-grams of size 2.
  def bigrams
    ngrams(2)
  end

  # @return [Array<Array<String>>] n-grams of size 3.
  def trigrams
    ngrams(3)
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Reads one file of a tagged corpus in which every token is written as
# +word/tag+ and tokens are separated by whitespace, one sentence per line.
# Tags are discarded so that only the raw sentences remain.
class BrownCorpusFile
  # @param path [String] location of the corpus file on disk.
  def initialize(path)
    @path = path
  end

  # Extracts the untagged sentences from the file (memoized).
  #
  # Blank and whitespace-only lines are skipped. The file is read line by
  # line rather than loaded whole.
  #
  # @return [Array<String>] one space-joined sentence per non-blank line.
  def sentences
    @sentences ||= File.foreach(@path).each_with_object([]) do |line, acc|
      # split(' ') ignores leading and trailing whitespace, so a blank line
      # yields no tokens at all.
      tokens = line.split(' ')
      next if tokens.empty?

      acc << tokens.map { |token| strip_tag(token) }.join(' ')
    end
  end

  private

  # Removes the trailing "/tag" from a token. Only the last slash separates
  # word from tag, so a word that itself contains slashes ("1/2/cd") keeps
  # them ("1/2") instead of being cut at the first slash.
  def strip_tag(token)
    token.sub(%r{/[^/]*\z}, '')
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Generates n-grams from a string: per word by default, or per character or
# any custom unit by supplying a different splitter.
class Ngram
  # Used for any option the caller does not supply.
  DEFAULT_OPTIONS = { regex: / / }.freeze

  attr_accessor :options

  # @param target [String] the text to tokenise.
  # @param options [Hash] +:regex+ is the pattern passed to +String#split+
  #   to cut +target+ into tokens. Caller options are merged over the
  #   defaults, so a hash without +:regex+ still splits on single spaces
  #   instead of silently changing the tokenisation.
  def initialize(target, options = {})
    @target = target
    @options = DEFAULT_OPTIONS.merge(options || {})
  end

  # @param n [Integer] number of consecutive tokens per n-gram.
  # @return [Array<Array<String>>] every run of +n+ consecutive tokens, in
  #   order; empty when the text has fewer than +n+ tokens.
  def ngrams(n)
    @target.split(@options[:regex]).each_cons(n).to_a
  end

  # @return [Array<Array<String>>] n-grams of size 1.
  def unigrams
    ngrams(1)
  end

  # @return [Array<Array<String>>] n-grams of size 2.
  def bigrams
    ngrams(2)
  end

  # @return [Array<Array<String>>] n-grams of size 3.
  def trigrams
    ngrams(3)
  end
end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby_nlp
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nathan Kleyn
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-09-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 2.14.1
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 2.14.1
|
27
|
+
description: A simple NLP toolkit in pure Ruby. See http://github.com/nathankleyn/ruby_nlp
|
28
|
+
for more information.
|
29
|
+
email: nathan@nathankleyn.com
|
30
|
+
executables: []
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files:
|
33
|
+
- README.md
|
34
|
+
files:
|
35
|
+
- README.md
|
36
|
+
- lib/ruby_nlp/corpus_files/brown.rb
|
37
|
+
- lib/ruby_nlp/corpus.rb
|
38
|
+
- lib/ruby_nlp/ngram.rb
|
39
|
+
homepage: http://github.com/nathankleyn/ruby_nlp
|
40
|
+
licenses:
|
41
|
+
- MIT
|
42
|
+
metadata: {}
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options: []
|
45
|
+
require_paths:
|
46
|
+
- lib
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
48
|
+
requirements:
|
49
|
+
- - '>='
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '0'
|
52
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - '>='
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0'
|
57
|
+
requirements: []
|
58
|
+
rubyforge_project:
|
59
|
+
rubygems_version: 2.0.3
|
60
|
+
signing_key:
|
61
|
+
specification_version: 4
|
62
|
+
summary: A simple NLP toolkit in pure Ruby.
|
63
|
+
test_files: []
|