greeb 0.1.0.rc1 → 0.1.0.rc3

data/.gitignore CHANGED
@@ -25,10 +25,17 @@ nbproject
  ## RVM
  .rvmrc

+ ## RUBINIUS
+ .rbx
+
  ## BUNDLER
  .bundle
  Gemfile.lock

+ ## YARD
+ .yardoc
+ doc
+
  ## PROJECT::GENERAL
  coverage
  pkg
data/.travis.yml CHANGED
@@ -4,4 +4,3 @@ branches:
  - master
  rvm:
  - 1.9.3
- - rbx-19mode
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
- Copyright (c) 2010-2012 Dmitry A. Ustalov
+ Copyright (c) 2010-2012 Dmitry Ustalov

  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -79,7 +79,8 @@ such as sentence detection tasks:

  ```ruby
  text = 'Hello! How are you?'
- pp Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
+ tokenizer = Greeb::Tokenizer.new(text)
+ pp Greeb::Segmentator.new(tokenizer).sentences
  =begin
  #<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
  #<struct Greeb::Entity from=7, to=19, type=:sentence>}>
@@ -91,8 +92,8 @@ segmentator:

  ```ruby
  text = 'Hello! How are you?'
- segmentator = Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
- sentences = segmentator.sentences
+ tokenizer = Greeb::Tokenizer.new(text)
+ sentences = Greeb::Segmentator.new(tokenizer).sentences
  pp segmentator.extract(*sentences)
  =begin
  {#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
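For reference, here is a self-contained sketch of the same extraction flow; it keeps an explicit `segmentator` variable so the `segmentator.extract(*sentences)` call on the context line above still resolves (the variable names are illustrative, not part of the gem):

```ruby
require 'greeb'

text = 'Hello! How are you?'
tokenizer = Greeb::Tokenizer.new(text)
segmentator = Greeb::Segmentator.new(tokenizer)

sentences = segmentator.sentences
# Maps each sentence entity to the array of tokens it covers.
pp segmentator.extract(*sentences)
```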
@@ -135,6 +136,6 @@ systematic and awesome.

  ## Copyright

- Copyright (c) 2010-2012 [Dmitry A. Ustalov]. See LICENSE for details.
+ Copyright (c) 2010-2012 [Dmitry Ustalov]. See LICENSE for details.

- [Dmitry A. Ustalov]: http://eveel.ru
+ [Dmitry Ustalov]: http://eveel.ru
data/lib/greeb.rb CHANGED
@@ -12,6 +12,7 @@ require 'greeb/version'
  # `:break` for line endings.
  #
  class Greeb::Entity < Struct.new(:from, :to, :type)
+   # @private
    def <=> other
      if (comparison = self.from <=> other.from) == 0
        self.to <=> other.to
@@ -21,5 +22,6 @@ class Greeb::Entity < Struct.new(:from, :to, :type)
    end
  end

+ require 'greeb/strscan'
  require 'greeb/tokenizer'
  require 'greeb/segmentator'
data/lib/greeb/segmentator.rb CHANGED
@@ -14,7 +14,7 @@ class Greeb::Segmentator
    # Create a new instance of {Greeb::Segmentator}.
    #
    # @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
-   #   Greeb::Tokenizer or set of its results.
+   #   {Greeb::Tokenizer} or set of its results.
    #
    def initialize tokenizer_or_tokens
      @tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
@@ -38,7 +38,7 @@ class Greeb::Segmentator
    # @param sentences [Array<Greeb::Entity>] a list of sentences.
    #
    # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
-   #   sentences as keys and tokens arrays as values.
+   #   sentences as keys and tokens arrays as values.
    #
    def extract *sentences
      Hash[
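Per the @param documentation above, the constructor accepts either a tokenizer instance or its token set. A minimal sketch under that assumption (variable names are illustrative):

```ruby
require 'greeb'

tokenizer = Greeb::Tokenizer.new('Hello! How are you?')

# Both documented argument forms should be accepted:
from_tokenizer = Greeb::Segmentator.new(tokenizer)
from_tokens    = Greeb::Segmentator.new(tokenizer.tokens)

# Either way, sentence detection should yield the same SortedSet.
from_tokenizer.sentences == from_tokens.sentences
```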
data/lib/greeb/strscan.rb ADDED
@@ -0,0 +1,20 @@
+ # encoding: utf-8
+
+ require 'strscan'
+
+ # {StringScanner} provides for lexical scanning operations on a String.
+ # This implementation covers the byte slicing problem in the standard
+ # library's implementation.
+ #
+ class Greeb::StringScanner < StringScanner
+   # Returns the character position of the scan pointer. In the `reset`
+   # position, this value is zero. In the `terminated` position
+   # (i.e. the string is exhausted), this value is the length
+   # of the string.
+   #
+   # @return [Fixnum] the character position of the scan pointer.
+   #
+   def char_pos
+     string.byteslice(0...pos).length
+   end
+ end
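To illustrate why `char_pos` exists: `StringScanner#pos` counts bytes, so on multibyte UTF-8 input it overshoots the character index, while `char_pos` converts the byte offset back to characters. A minimal sketch (the Cyrillic sample string is an assumption, not taken from the gem):

```ruby
# encoding: utf-8
require 'greeb'  # loads greeb/strscan as of this release

scanner = Greeb::StringScanner.new('Привет, мир!')
scanner.scan(/\p{L}+/)  # consumes "Привет": 6 characters, 12 bytes in UTF-8

scanner.pos       # => 12, the standard library's byte offset
scanner.char_pos  # => 6, the character offset via byteslice
```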
data/lib/greeb/tokenizer.rb CHANGED
@@ -1,6 +1,5 @@
  # encoding: utf-8

- require 'strscan'
  require 'set'

  # Greeb's tokenization facilities. Use 'em with love.
@@ -61,7 +60,7 @@ class Greeb::Tokenizer
    # @return [nil] nothing unless exception is raised.
    #
    def tokenize!
-     @scanner = StringScanner.new(text)
+     @scanner = Greeb::StringScanner.new(text)
      @tokens = SortedSet.new
      while !scanner.eos?
        parse! LETTERS, :letter or
@@ -82,13 +81,16 @@ class Greeb::Tokenizer
    #
    # @param pattern [Regexp] a regular expression to extract the token.
    # @param type [Symbol] a symbol that represents the necessary token
-   #   type.
+   #   type.
    #
    # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
    #
    def parse! pattern, type
      return false unless token = scanner.scan(pattern)
-     @tokens << Greeb::Entity.new(scanner.pos - token.length, scanner.pos, type)
+     position = scanner.char_pos
+     @tokens << Greeb::Entity.new(position - token.length,
+                                  position,
+                                  type)
    end

    # Try to parse one small piece of text that is covered by pattern
@@ -97,13 +99,13 @@ class Greeb::Tokenizer
    #
    # @param pattern [Regexp] a regular expression to extract the token.
    # @param type [Symbol] a symbol that represents the necessary token
-   #   type.
+   #   type.
    #
    # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
    #
    def split_parse! pattern, type
      return false unless token = scanner.scan(pattern)
-     position = scanner.pos - token.length
+     position = scanner.char_pos - token.length
      token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
        @tokens << Greeb::Entity.new(before, before + s.length, type)
        before + s.length
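With both parsers now reading positions through `char_pos`, entity boundaries are character offsets, so slicing the input with `from...to` recovers each token even for non-ASCII text. A small sketch; the sample string and the expected types are modeled on the Russian-language spec further down, not on additional guarantees:

```ruby
# encoding: utf-8
require 'greeb'

text = 'Привет, мир!'
Greeb::Tokenizer.new(text).tokens.map { |t| [text[t.from...t.to], t.type] }
# Expected, by analogy with the spec below:
# [["Привет", :letter], [",", :spunct], [" ", :separ],
#  ["мир", :letter], ["!", :punct]]
```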
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
  module Greeb
    # Version of Greeb.
    #
-   VERSION = '0.1.0.rc1'
+   VERSION = '0.1.0.rc3'
  end
data/spec/tokenizer_spec.rb CHANGED
@@ -86,6 +86,22 @@ module Greeb
                         Entity.new(4, 7, :integer)])
        )
      end
+
+     it 'can deal with Russian language' do
+       Tokenizer.new('Братишка, я тебе покушать принёс!').tokens.must_equal(
+         SortedSet.new([Entity.new(0, 8, :letter),
+                        Entity.new(8, 9, :spunct),
+                        Entity.new(9, 10, :separ),
+                        Entity.new(10, 11, :letter),
+                        Entity.new(11, 12, :separ),
+                        Entity.new(12, 16, :letter),
+                        Entity.new(16, 17, :separ),
+                        Entity.new(17, 25, :letter),
+                        Entity.new(25, 26, :separ),
+                        Entity.new(26, 32, :letter),
+                        Entity.new(32, 33, :punct)])
+       )
+     end
    end
  end
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: greeb
  version: !ruby/object:Gem::Version
- version: 0.1.0.rc1
+ version: 0.1.0.rc3
  prerelease: 6
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-07-08 00:00:00.000000000 Z
+ date: 2012-07-10 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rake
@@ -92,6 +92,7 @@ files:
  - greeb.gemspec
  - lib/greeb.rb
  - lib/greeb/segmentator.rb
+ - lib/greeb/strscan.rb
  - lib/greeb/tokenizer.rb
  - lib/greeb/version.rb
  - spec/segmentator_spec.rb
@@ -111,7 +112,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  segments:
  - 0
- hash: -4603914053803130942
+ hash: -2527935574265859361
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements: