ruby_ngrams 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/bin/ruby_ngrams +58 -0
  2. data/lib/ruby_ngrams.rb +47 -0
  3. metadata +5 -5
data/bin/ruby_ngrams CHANGED
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+ require 'ruby_cli'
4
+ require 'ruby_ngrams'
5
+
6
+ class App
7
+ include RubyCLI
8
+
9
+ def define_command_options() @options = {:regex => //, :n => 2} end
10
+
11
+ # Define an OptionParser to parse the command line
12
+ def parse_options?
13
+ #configure an OptionParser
14
+ @opt_parser = OptionParser.new do |opts|
15
+ opts.banner = "Usage: #{__FILE__} [OPTIONS]... [FILE]..."
16
+ opts.separator ""
17
+ opts.separator "Specific options:"
18
+ opts.on('-h', '--help', 'displays help information') do
19
+ @default_options[:help] = true
20
+ output_help
21
+ end
22
+ opts.on('-V','--verbose','Run verbosely') do
23
+ @default_options[:verbose] = true
24
+ end
25
+ opts.on('-n', '--n NUM', Integer, 'set length n for n-grams') do |n|
26
+ @options[:n] = n
27
+ end
28
+ opts.on('-r', '--regex REGEX', Regexp, 'set regex to split string into tokens') do |r|
29
+ @options[:regex] = r
30
+ end
31
+ end
32
+ @opt_parser.parse!(@default_argv) rescue return false
33
+ true
34
+ end
35
+
36
+ def command
37
+ # If arguments were provided, then they have to be names of files.
38
+ # These files will be handled using Ruby's ARGF builtin variable.
39
+ # If arguments are not filenames, then this application will produce a
40
+ # runtime error informing the user that the given file could not be opened.
41
+
42
+ # ARGF is a stream designed for use in scripts that process files given as
43
+ # command-line arguments or passed in via STDIN.
44
+ # The arguments passed to your script are stored in the ARGV Array,
45
+ # one argument per element. ARGF assumes that any arguments that aren’t
46
+ # filenames have been removed from ARGV.
47
+ text = ARGF.read
48
+ text.ngrams(@options).each { |ngram| puts ngram.inspect }
49
+ end
50
+
51
+ end
52
+
53
+
54
+ if __FILE__ == $0
55
+ app = App.new(ARGV)
56
+ app.run
57
+ end
58
+
data/lib/ruby_ngrams.rb CHANGED
@@ -0,0 +1,47 @@
1
+ # This is an extension of Ruby's core String class.
2
+ # It adds methods to extract a set of n-grams from a string.
3
+ # Typically, the most commonly used sets of n-grams are unigrams, bigrams, and trigrams;
4
+ # sets of n-grams of length 1, 2, and 3, respectively.
5
+ class String
6
+
7
+ # An n-gram is a sequence of units of text of length n, where those units are
8
+ # typically single characters or words delimited by space characters.
9
+ # However, a token could also be a fixed length character sequence, strings
10
+ # with embedded spaces, etc. depending on the intended application.
11
+ # Typically, n-grams are formed of contiguous tokens.
12
+ #
13
+ # This function splits the string into a set of n-grams.
14
+ # The default regex used tokenizes the string into characters.
15
+ #
16
+ # Regex Examples:
17
+ # // => splits into characters
18
+ # /\s+/ => splits into words delimited by one or more space characters
19
+ # /\n+/ => splits into lines delimited by one or more newline characters
20
+ #
21
+ # TODO: Determine efficiency of this function on long strings.
22
+ # TODO: Determine how well this works on strings in binary format.
23
+ def ngrams(options = {:regex=>//, :n=>2})
24
+ ngrams = []
25
+ tokens = self.split(options[:regex])
26
+ max_pos = tokens.length - options[:n]
27
+ for i in 0..max_pos
28
+ ngrams.push(tokens[i..i+(options[:n]-1)])
29
+ end
30
+ ngrams
31
+ end
32
+
33
+ # This function splits the string into unigrams,
34
+ # tokenizes into chars by default
35
+ def unigrams(regex = //) ngrams({:regex => regex, :n => 1}); end
36
+
37
+ # This function splits the string into bigrams
38
+ # tokenizes into chars by default
39
+ def bigrams(regex = //) ngrams({:regex => regex, :n => 2}); end
40
+
41
+ # This function splits the string into trigrams
42
+ # tokenizes into chars by default
43
+ def trigrams(regex = //) ngrams({:regex => regex, :n => 3}); end
44
+
45
+ end #class String
46
+
47
+
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby_ngrams
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Martin Velez
@@ -15,11 +15,11 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-10-19 00:00:00 -07:00
18
+ date: 2011-11-11 00:00:00 -08:00
19
19
  default_executable:
20
20
  dependencies: []
21
21
 
22
- description: A simple Ruby library to parse a string into n-grams
22
+ description: A simple extension of the Ruby core string class to parse a string into n-grams
23
23
  email: mvelez999@gmail.com
24
24
  executables:
25
25
  - ruby_ngrams