ngrams_search 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/README.rdoc +69 -0
- data/bin/ngrams_search +40 -0
- data/lib/ngrams_search.rb +45 -0
- metadata +62 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
ZTAzOTJmOGYyODNjZWNkZDgyNzkzYTAzNTU2MThlNzAxNGI2OTMxNw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MDAyMzU2NzZkY2YxM2E1NGQ4NGU4YTJiZjdmMTJiZDIyZjBlNTZiZA==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZDU1NzZlMWZmNzc3N2U0MTMzNTU5YzQ4MzZmODVmZmE1MDBiZmU4OGQ2OGI3
|
10
|
+
Y2U2NWUwNzMzZWNlYzY2NTMwMGJlOTg4ZGUxNGQ0NzlkZTlkMDkzMTdiNDI1
|
11
|
+
ZTllZTA2MzdiYTQ2ZDI0NzlhOGUzOWIzY2Y4OTYzYTE1ZDU5Y2E=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
N2U4NThhZTgxMDE3Yjg2ZWJhNzY3MWUyNmZjNjc4MTAxMjA1ZWI5NGZiYmM0
|
14
|
+
MGU1YzhmZTEyNzVkNTA4ZDJlMTQ3ZGJkNmMxZTRkMWJjNjcyYjA5MTQ4NjEw
|
15
|
+
NTA0MTJhNzg3YThjNWIwMDI1ZDhhNTMzYzAwYTQ4NDEzYmM1ZDM=
|
data/README.rdoc
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
= ngrams_search
|
2
|
+
|
3
|
+
Author:: Elias Hasnat
|
4
|
+
|
5
|
+
= Description
|
6
|
+
|
7
|
+
ngrams_search is an extension of Ruby's core String class. It provides a String
|
8
|
+
object with the capability to produce n-grams.
|
9
|
+
|
10
|
+
From Wikipedia,
|
11
|
+
"In the fields of computational linguistics and probability, an n-gram is a
|
12
|
+
contiguous sequence of n items from a given sequence of text or speech. The
|
13
|
+
items in question can be phonemes, syllables, letters, words or base pairs
|
14
|
+
according to the application. n-grams are collected from a text or speech corpus.
|
15
|
+
|
16
|
+
An n-gram of size 1 is referred to as a "unigram"; size 2 is a "bigram"
|
17
|
+
(or, less commonly, a "digram"); size 3 is a "trigram"; size 4 is a "four-gram"
|
18
|
+
and size 5 or more is simply called an "n-gram"."
|
19
|
+
|
20
|
+
= Design
|
21
|
+
|
22
|
+
Instead of creating another namespace, this task seemed simple enough to merit
|
23
|
+
extending the String class. A string is a sequence of characters.
|
24
|
+
It can be words, binary code, sentences, paragraphs, etc. In short,
|
25
|
+
anything that you can store in a Ruby String object can be parsed into
|
26
|
+
n-grams of length n.
|
27
|
+
|
28
|
+
The main method being added to the String class is ngrams(). It produces an
|
29
|
+
array of n-grams from a Ruby String object.
|
30
|
+
|
31
|
+
For example, let s be a Ruby String object.
|
32
|
+
Then s.ngrams() returns an array of n-grams from s.
|
33
|
+
|
34
|
+
Tokenization of s is set to single characters by default.
|
35
|
+
For example, if
|
36
|
+
s = "Hello World!",
|
37
|
+
then the tokens of s are
|
38
|
+
["H","e","l","l","o"," ","W","o","r","l","d","!"].
|
39
|
+
By specifying a regular expression, you can tokenize the string s in many
|
40
|
+
different and useful ways.
|
41
|
+
|
42
|
+
If you set n = 4, then
|
43
|
+
s.ngrams = [["H", "e", "l", "l"],
|
44
|
+
["e", "l", "l", "o"],
|
45
|
+
["l", "l", "o", " "],
|
46
|
+
["l", "o", " ", "W"],
|
47
|
+
["o", " ", "W", "o"],
|
48
|
+
[" ", "W", "o", "r"],
|
49
|
+
["W", "o", "r", "l"],
|
50
|
+
["o", "r", "l", "d"],
|
51
|
+
["r", "l", "d", "!"]].
|
52
|
+
|
53
|
+
Each item in the s.ngrams array can be joined but doesn't need to be.
|
54
|
+
If you want to join them, normally you can do so easily if it is text.
|
55
|
+
Be careful if you are trying to join n-grams with non-printable characters.
|
56
|
+
|
57
|
+
You can google "n-grams" to get more information about how n-grams are useful.
|
58
|
+
|
59
|
+
= Installation
|
60
|
+
gem install ngrams_search
|
61
|
+
|
62
|
+
= Usage
|
63
|
+
|
64
|
+
You can simply run the executable and provide input via STDIN.
|
65
|
+
ngrams_search
|
66
|
+
|
67
|
+
You can also provide input via one or more filenames
|
68
|
+
ngrams_search [FILES]
|
69
|
+
|
data/bin/ngrams_search
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'ruby_cli'
|
4
|
+
require 'ngrams_search'
|
5
|
+
|
6
|
+
# Command-line front end for String#ngrams: reads text from files or STDIN
# and prints one n-gram per line.
# NOTE(review): the three methods below look like hooks invoked by RubyCLI
# (which also appears to supply @opt_parser and #run) -- confirm against the
# ruby_cli gem.
class App
  include RubyCLI

  # Seeds @options with the defaults: tokenize into single characters and
  # build bigrams.
  def initialize_command_options
    @options = {:regex => //, :n => 2}
  end

  # Registers the -n and -r switches; each one overwrites the matching
  # entry in @options with the parsed value.
  def define_command_option_parsing
    @opt_parser.on('-n', '--n NUM', Integer, 'set length n for n-grams') { |length| @options[:n] = length }
    @opt_parser.on('-r', '--regex "REGEX"', Regexp, 'set regex to split string into tokens') { |pattern| @options[:regex] = pattern }
  end

  # Reads all input through ARGF and prints the inspected form of every n-gram.
  #
  # ARGF is a stream designed for scripts that process files named on the
  # command line or data piped in via STDIN. It treats every element left in
  # ARGV as a filename, so any remaining argument that is not a readable file
  # produces a runtime error saying the file could not be opened.
  def command
    ARGF.read.ngrams(@options).each do |gram|
      puts gram.inspect
    end
  end
end
|
36
|
+
|
37
|
+
# Build the CLI application from the command-line arguments and start it.
# NOTE(review): #run is presumably provided by RubyCLI -- confirm.
App.new(ARGV, __FILE__).run
|
39
|
+
|
40
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# This is an extension of Ruby's core String class.
|
2
|
+
# It adds methods to extract a set of n-grams from a string.
|
3
|
+
# Typically, the most used set of n-grams are unigrams, bigrams, and trigrams;
|
4
|
+
# sets of n-grams of length 1, 2, and 3, respectively.
|
5
|
+
class String

  # An n-gram is a sequence of n contiguous tokens, where a token is
  # typically a single character or a word delimited by space characters.
  # A token could also be a fixed-length character sequence, a string with
  # embedded spaces, etc., depending on the intended application.
  #
  # Splits the string into tokens and returns every run of n contiguous
  # tokens, in order of appearance.
  #
  # options - Hash of settings. Keys that are omitted fall back to their
  #           defaults, so a partial hash such as {:n => 3} is fine.
  #           :regex - pattern handed to String#split to tokenize the string
  #                    (default: //, which splits into single characters)
  #           :n     - number of tokens per n-gram (default: 2)
  #
  # Regex examples:
  #   //     => splits into characters
  #   /\s+/  => splits into words delimited by one or more space characters
  #   /\n+/  => splits into lines delimited by one or more newline characters
  #
  # Returns an Array of token Arrays; it is empty when the string yields
  # fewer than n tokens.
  # Raises ArgumentError (from Enumerable#each_cons) when :n is less than 1.
  def ngrams(options = {})
    # Merge rather than replace, so a caller-supplied partial hash keeps the
    # remaining defaults instead of leaving them nil.
    settings = {:regex => //, :n => 2}.merge(options)
    split(settings[:regex]).each_cons(settings[:n]).to_a
  end

  # Splits the string into unigrams (n-grams of length 1).
  # Tokenizes into characters unless another regex is given.
  def unigrams(regex = //)
    ngrams(:regex => regex, :n => 1)
  end

  # Splits the string into bigrams (n-grams of length 2).
  # Tokenizes into characters unless another regex is given.
  def bigrams(regex = //)
    ngrams(:regex => regex, :n => 2)
  end

  # Splits the string into trigrams (n-grams of length 3).
  # Tokenizes into characters unless another regex is given.
  def trigrams(regex = //)
    ngrams(:regex => regex, :n => 3)
  end

end # class String
|
44
|
+
|
45
|
+
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ngrams_search
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Elias Hasnat
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-09-03 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ruby_cli
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.2.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.2.0
|
27
|
+
description: n-grams string search
|
28
|
+
email: android.hasnat@gmail.com
|
29
|
+
executables:
|
30
|
+
- ngrams_search
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- lib/ngrams_search.rb
|
35
|
+
- bin/ngrams_search
|
36
|
+
- README.rdoc
|
37
|
+
homepage: http://github.com/claymodel/telephony/ngrams_search
|
38
|
+
licenses: []
|
39
|
+
metadata: {}
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
- bin
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ! '>='
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '0'
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
requirements: []
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 2.0.7
|
58
|
+
signing_key:
|
59
|
+
specification_version: 4
|
60
|
+
summary: Search string using n-grams
|
61
|
+
test_files: []
|
62
|
+
has_rdoc:
|