ruby_ngrams 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/ruby_ngrams +58 -0
- data/lib/ruby_ngrams.rb +47 -0
- metadata +5 -5
data/bin/ruby_ngrams
CHANGED
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
require 'ruby_cli'
|
4
|
+
require 'ruby_ngrams'
|
5
|
+
|
6
|
+
# Command-line front end that reads text and prints its n-grams.
#
# The class mixes in RubyCLI and supplies the hooks visible below.
# NOTE(review): @default_options, @default_argv and output_help are not
# defined in this file; presumably they come from RubyCLI -- confirm.
class App
  include RubyCLI

  # Seeds @options with the defaults: the empty regex and n = 2.
  def define_command_options
    @options = {:regex => //, :n => 2}
  end

  # Builds the OptionParser, stores it in @opt_parser and parses
  # @default_argv in place.
  #
  # Returns true when parsing completes, false when parsing raises any
  # StandardError.
  def parse_options?
    @opt_parser = OptionParser.new do |parser|
      parser.banner = "Usage: #{__FILE__} [OPTIONS]... [FILE]..."
      parser.separator ""
      parser.separator "Specific options:"

      parser.on('-h', '--help', 'displays help information') do
        @default_options[:help] = true
        output_help
      end

      parser.on('-V','--verbose','Run verbosely') { @default_options[:verbose] = true }

      # OptionParser coerces NUM to an Integer before the block runs.
      parser.on('-n', '--n NUM', Integer, 'set length n for n-grams') do |length|
        @options[:n] = length
      end

      # OptionParser coerces REGEX to a Regexp before the block runs.
      parser.on('-r', '--regex REGEX', Regexp, 'set regex to split string into tokens') do |pattern|
        @options[:regex] = pattern
      end
    end

    begin
      @opt_parser.parse!(@default_argv)
    rescue StandardError
      # Any parse failure is reported to the caller as false.
      return false
    end
    true
  end

  # Reads all input through ARGF and prints one inspected n-gram per line.
  #
  # ARGF treats every element left in ARGV as a file name and falls back
  # to STDIN when ARGV is empty, so option arguments must already have
  # been removed from ARGV. An argument that is not a readable file
  # produces a runtime error saying the file could not be opened.
  def command
    ARGF.read.ngrams(@options).each do |gram|
      puts gram.inspect
    end
  end
end
|
52
|
+
|
53
|
+
|
54
|
+
# Run the application only when this file is executed directly,
# not when it is loaded by another script.
App.new(ARGV).run if __FILE__ == $0
|
58
|
+
|
data/lib/ruby_ngrams.rb
CHANGED
@@ -0,0 +1,47 @@
|
|
1
|
+
# This is an extension of Ruby's core String class.
|
2
|
+
# It adds methods to extract a set of n-grams from a string.
|
3
|
+
# Typically, the most used set of n-grams are unigrams, bigrams, and trigrams;
|
4
|
+
# sets of n-grams of length 1, 2, and 3, respectively.
|
5
|
+
class String

  # An n-gram is a sequence of units of text of length n, where those units are
  # typically single characters or words delimited by space characters.
  # However, a token could also be a fixed length character sequence, strings
  # with embedded spaces, etc. depending on the intended application.
  # Typically, n-grams are formed of contiguous tokens.
  #
  # This function splits the string into tokens with the given regex and
  # returns every run of n contiguous tokens, in order.
  #
  # Options (any key left out falls back to its default):
  #   :regex => pattern handed to String#split; defaults to //, which
  #             tokenizes the string into characters
  #   :n     => length of each n-gram; defaults to 2
  #
  # Regex Examples:
  # // => splits into characters
  # /\s+/ => splits into words delimited by one or more space characters
  # /\n+/ => splits into lines delimited by one or more newline characters
  #
  # Returns an Array of Arrays of tokens. The result is empty when the
  # string has fewer than n tokens.
  #
  # Raises ArgumentError when :n is not a positive Integer.
  #
  # TODO: Determine efficiency of this function on long strings.
  # TODO: Determine how well this works on strings in binary format.
  def ngrams(options = {:regex=>//, :n=>2})
    # Merge so that a caller passing only :n or only :regex still gets
    # the documented default for the other key.
    settings = {:regex => //, :n => 2}.merge(options)
    size = settings[:n]

    unless size.is_a?(Integer) && size > 0
      raise ArgumentError, "n must be a positive integer (got #{size.inspect})"
    end

    # each_cons yields every window of `size` consecutive tokens and
    # yields nothing when there are fewer than `size` tokens.
    split(settings[:regex]).each_cons(size).to_a
  end

  # This function splits the string into unigrams,
  # tokenizes into chars by default
  def unigrams(regex = //)
    ngrams({:regex => regex, :n => 1})
  end

  # This function splits the string into bigrams
  # tokenizes into chars by default
  def bigrams(regex = //)
    ngrams({:regex => regex, :n => 2})
  end

  # This function splits the string into trigrams
  # tokenizes into chars by default
  def trigrams(regex = //)
    ngrams({:regex => regex, :n => 3})
  end

end #class String
|
46
|
+
|
47
|
+
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby_ngrams
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Martin Velez
|
@@ -15,11 +15,11 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-11-11 00:00:00 -08:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|
22
|
-
description: A simple Ruby
|
22
|
+
description: A simple extension of the Ruby core string class to parse a string into n-grams
|
23
23
|
email: mvelez999@gmail.com
|
24
24
|
executables:
|
25
25
|
- ruby_ngrams
|