greeb 0.2.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2e350306a429635314d151614f03ef4a0edd6887
4
- data.tar.gz: 99aa7bb04b04589adbb1f62aacf2733071d14b53
3
+ metadata.gz: 07673b32254cd2b0ab0edf0664fa59e46231dbe3
4
+ data.tar.gz: f8eaac92c0fd4d7dda99c4441117b5e2b34c5caa
5
5
  SHA512:
6
- metadata.gz: ae38f8a918d857283183bde7f003f44b88be59fa67213be58fb9267e2fb51853fba7422918fb1c133c526c401e0baabae7c83ed753ffc38f887a56055884d74e
7
- data.tar.gz: a4d66506d58d10211aa776606f022fda66c038c7c209d3df92e11bf229bf50b990e6c05351287408fe6acce69386f6e18013730563233fbecbc01ebe577471d3
6
+ metadata.gz: ebfda44f713c3dcda9df0439f073a1c075360c14cc41e99c400db8eec25f06033ba2d788b5c4b7b8715eeb5cc085e6c704f6e9bad08bfd6af7b4bc4d051a8c32
7
+ data.tar.gz: cb60f13ddad1e17a7cdbbd19add866677f92590cb7072dd5dc3cf70b80c93a31e0e857366908ff214bb0cc5b2c585415abd78338bd8479ea3150ebf58f5d2117
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010-2013 Dmitry Ustalov
1
+ Copyright (c) 2010-2014 Dmitry Ustalov
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # Greeb
2
2
  Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
3
- that is based on regular expressions. API documentation is available at
4
- <http://rubydoc.info/github/dmchk/greeb/master/frames>.
3
+ that is based on regular expressions. The API documentation is available
4
+ at <http://rubydoc.info/github/dmchk/greeb/master/frames>.
5
5
 
6
6
  ## Installation
7
7
  Add this line to your application's Gemfile:
@@ -134,12 +134,12 @@ addresses. Greeb can help you in these strings retrieval.
134
134
  ```ruby
135
135
  text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
136
136
 
137
- pp Greeb::Parser.urls(text).map { |e| [e, text[e.from...e.to]] }
137
+ pp Greeb::Parser.urls(text).map { |e| [e, e.slice(text)] }
138
138
  =begin
139
139
  [[#<struct Greeb::Span from=14, to=29, type=:url>, "http://nlpub.ru"]]
140
140
  =end
141
141
 
142
- pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
142
+ pp Greeb::Parser.emails(text).map { |e| [e, e.slice(text)] }
143
143
  =begin
144
144
  [[#<struct Greeb::Span from=44, to=63, type=:email>, "example@example.com"]]
145
145
  =end
@@ -151,7 +151,7 @@ Please don't use Greeb in spam lists development purposes.
151
151
  ```ruby
152
152
  text = 'Hello, G.L.H.F. everyone!'
153
153
 
154
- pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
154
+ pp Greeb::Parser.abbrevs(text).map { |e| [e, e.slice(text)] }
155
155
  =begin
156
156
  [[#<struct Greeb::Span from=7, to=15, type=:abbrev>, "G.L.H.F."]]
157
157
  =end
@@ -164,7 +164,7 @@ situations.
164
164
  ```ruby
165
165
  text = 'Our time is running out: 13:37 or 14:89.'
166
166
 
167
- pp Greeb::Parser.time(text).map { |e| [e, text[e.from...e.to]] }
167
+ pp Greeb::Parser.time(text).map { |e| [e, e.slice(text)] }
168
168
  =begin
169
169
  [[#<struct Greeb::Span from=25, to=30, type=:time>, "13:37"]]
170
170
  =end
@@ -194,6 +194,6 @@ There are several span types at the tokenization stage: `:letter`,
194
194
 
195
195
  ## Copyright
196
196
 
197
- Copyright (c) 2010-2013 [Dmitry Ustalov]. See LICENSE for details.
197
+ Copyright (c) 2010-2014 [Dmitry Ustalov]. See LICENSE for details.
198
198
 
199
- [Dmitry Ustalov]: http://eveel.ru
199
+ [Dmitry Ustalov]: http://ustalov.name/
data/bin/greeb CHANGED
@@ -1,13 +1,45 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'ostruct'
4
+ require 'optparse'
5
+
3
6
  if File.exists? File.expand_path('../../.git', __FILE__)
4
7
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
5
8
  end
6
9
 
7
10
  require 'greeb'
8
11
 
9
- text = STDIN.read.tap(&:chomp!)
12
+ options = OpenStruct.new(input: STDIN, output: STDOUT)
13
+
14
+ optparse = OptionParser.new do |opts|
15
+ opts.banner = 'Usage: %s [options] command' % $PROGRAM_NAME
16
+
17
+ opts.on '-i', '--input [file]', 'Input file' do |input|
18
+ options.input = File.open(input)
19
+ at_exit { options.input.close }
20
+ end
21
+
22
+ opts.on '-o', '--output [file]', 'Output file' do |output|
23
+ options.output = File.open(output, 'w') # open for writing; File.open defaults to read-only mode
24
+ at_exit { options.output.close }
25
+ end
26
+
27
+ opts.on_tail '-h', '--help', 'Just display this help' do
28
+ puts opts
29
+ exit
30
+ end
31
+
32
+ opts.on_tail '-v', '--version', 'Just print the version infomation' do
33
+ puts 'Greeb %s' % Greeb::VERSION
34
+ exit
35
+ end
36
+ end
37
+
38
+ optparse.parse!
39
+
40
+ text = options.input.read.tap(&:chomp!)
10
41
 
11
42
  Greeb[text].each do |span|
12
- puts text[span.from...span.to] unless [:space, :break].include? span.type
43
+ next if [:space, :break].include? span.type
44
+ options.output.puts text[span.from...span.to]
13
45
  end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  require 'greeb/version'
4
2
  require 'greeb/exceptions'
5
3
  require 'greeb/span'
@@ -11,7 +11,8 @@ module Greeb::Core
11
11
 
12
12
  # Recognize e-mail addresses in the input text.
13
13
  #
14
- # @param text [String] input text.
14
+ # @param text [String] an input text.
15
+ # @param helpers [Array<Symbol>] a set of helper identifiers.
15
16
  #
16
17
  # @return [Array<Greeb::Span>] a set of tokens.
17
18
  #
@@ -27,7 +28,6 @@ module Greeb::Core
27
28
 
28
29
  alias_method :'[]', :analyze
29
30
 
30
- protected
31
31
  # Extract spans of the specified type from the input spans set.
32
32
  #
33
33
  # @param spans [Array<Greeb::Span>] input spans set.
@@ -8,20 +8,33 @@ module Greeb::Parser
8
8
  extend self
9
9
 
10
10
  # A URL pattern. Not so precise, but IDN-compatible.
11
+ #
11
12
  URL = %r{\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\p{L}\w\d]+\)|([^.\s]|/)))}i
12
13
 
13
14
  # A horrible e-mail pattern.
15
+ #
14
16
  EMAIL = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}/i
15
17
 
16
18
  # Another horrible pattern. Now for abbreviations.
19
+ #
17
20
  ABBREV = /\b((-{0,1}\p{L}\.)*|(-{0,1}\p{L}\. )*)-{0,1}\p{L}\./i
18
21
 
19
22
  # This pattern matches anything that looks like HTML. Or not.
23
+ #
20
24
  HTML = /<(.*?)>/i
21
25
 
22
26
  # Time pattern.
27
+ #
23
28
  TIME = /\b(\d|[0-2]\d):[0-6]\d(:[0-6]\d){0,1}\b/i
24
29
 
30
+ # Apostrophe pattern.
31
+ #
32
+ APOSTROPHE = /['’]/i
33
+
34
+ # Together pattern.
35
+ #
36
+ TOGETHER = [:letter, :integer, :apostrophe, :together]
37
+
25
38
  # Recognize URLs in the input text. Actually, URL is obsolete standard
26
39
  # and this code should be rewritten to use the URI concept.
27
40
  #
@@ -73,6 +86,54 @@ module Greeb::Parser
73
86
  scan(text, TIME, :time)
74
87
  end
75
88
 
89
+ # Retrieve apostrophes from the tokenized text. The algorithm may be
90
+ # more optimal.
91
+ #
92
+ # @param text [String] input text.
93
+ # @param spans [Array<Greeb::Span>] already tokenized text.
94
+ #
95
+ # @return [Array<Greeb::Span>] retrieved apostrophes.
96
+ #
97
+ def apostrophes(text, spans)
98
+ apostrophes = scan(text, APOSTROPHE, :apostrophe)
99
+ return [] if apostrophes.empty?
100
+
101
+ apostrophes.each { |s| Greeb.extract_spans(spans, s) }.clear
102
+
103
+ spans.each_with_index.each_cons(3).reverse_each do |(s1, i), (s2, j), (s3, k)|
104
+ next unless s1 && s1.type == :letter
105
+ next unless s2 && s2.type == :apostrophe
106
+ next unless !s3 || s3 && s3.type == :letter
107
+ s3, k = s2, j unless s3
108
+ apostrophes << Greeb::Span.new(s1.from, s3.to, s1.type)
109
+ spans[i..k] = apostrophes.last
110
+ end
111
+
112
+ apostrophes
113
+ end
114
+
115
+ # Merge some spans that are together.
116
+ #
117
+ # @param spans [Array<Greeb::Span>] already tokenized text.
118
+ #
119
+ # @return [Array<Greeb::Span>] merged spans.
120
+ #
121
+ def together(spans)
122
+ loop do
123
+ converged = true
124
+
125
+ spans.each_with_index.each_cons(2).reverse_each do |(s1, i), (s2, j)|
126
+ next unless TOGETHER.include?(s1.type) && TOGETHER.include?(s2.type)
127
+ spans[i..j] = Greeb::Span.new(s1.from, s2.to, :together)
128
+ converged = false
129
+ end
130
+
131
+ break if converged
132
+ end
133
+
134
+ spans
135
+ end
136
+
76
137
  private
77
138
  # Implementation of regexp-based {Greeb::Span} scanner.
78
139
  #
@@ -55,12 +55,12 @@ class Greeb::Segmentator
55
55
  # process.
56
56
  # @param stop_marks [Array<Symbol>] an array that stores the
57
57
  # correspondent stop marks of the necessary spans.
58
+ # @param collection [Array<Greeb::Span>] an initial set of spans
59
+ # to be populated.
58
60
  #
59
- # @return [Array<Greeb::Span>] a set of entites.
61
+ # @return [Array<Greeb::Span>] a modified collection.
60
62
  #
61
- def detect_spans(sample, stop_marks)
62
- collection = []
63
-
63
+ def detect_spans(sample, stop_marks, collection = [])
64
64
  rest = tokens.inject(sample.dup) do |span, token|
65
65
  next span if sentence_aint_start? span, token
66
66
  span.from = token.from unless span.from
@@ -77,11 +77,7 @@ class Greeb::Segmentator
77
77
  span
78
78
  end
79
79
 
80
- if rest.from && rest.to
81
- collection << rest
82
- else
83
- collection
84
- end
80
+ rest.from && rest.to ? collection << rest : collection
85
81
  end
86
82
 
87
83
  private
@@ -19,6 +19,16 @@ class Greeb::Span < Struct.new(:from, :to, :type)
19
19
  Struct.new(*self.members, *members)
20
20
  end
21
21
 
22
+ # Select the slice of the given text using coordinates of this span.
23
+ #
24
+ # @param text [String] a text to be extracted.
25
+ #
26
+ # @return [String] the retrieved substring.
27
+ #
28
+ def slice(text)
29
+ text[from...to]
30
+ end
31
+
22
32
  # @private
23
33
  def <=> other
24
34
  if (comparison = self.from <=> other.from) == 0
@@ -47,6 +47,8 @@ module Greeb::Tokenizer
47
47
 
48
48
  # Perform the tokenization process.
49
49
  #
50
+ # @param text [String] a text to be tokenized.
51
+ #
50
52
  # @return [Array<Greeb::Span>] a set of tokens.
51
53
  #
52
54
  def tokenize text
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.2.2.1'
8
+ VERSION = '0.2.3'
9
9
  end
@@ -21,4 +21,12 @@ describe 'CLI' do
21
21
  invoke(stdin: 'Hello example@example.com guys!').must_equal(
22
22
  %w(Hello example@example.com guys !))
23
23
  end
24
+
25
+ it 'should print version' do
26
+ invoke('-v').join.must_match(/\AGreeb (\d\.)+\d\z/)
27
+ end
28
+
29
+ it 'should print help' do
30
+ invoke('-h').join.must_match(/Usage/)
31
+ end
24
32
  end
@@ -8,9 +8,11 @@ describe Parser do
8
8
  'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
9
9
  'example@example.com! It is available by URL: http://vasya.ru. ' \
10
10
  'Also, <b>G.L.H.F.</b> everyone! It\'s 13:37 or 00:02:28 right ' \
11
- 'now, not 14:89.').freeze
11
+ 'now, not 14:89. What about some Nagibator228?').freeze
12
12
  end
13
13
 
14
+ let(:spans) { Tokenizer.tokenize(text) }
15
+
14
16
  describe 'URL' do
15
17
  subject { Parser.urls(text) }
16
18
 
@@ -67,4 +69,24 @@ describe Parser do
67
69
  )
68
70
  end
69
71
  end
72
+
73
+ describe 'APOSTROPHE' do
74
+ subject { Parser.apostrophes(text, spans.dup) }
75
+
76
+ it 'recognizes apostrophes' do
77
+ subject.must_equal(
78
+ [Span.new(220, 224, :letter)]
79
+ )
80
+ end
81
+ end
82
+
83
+ describe 'TOGETHER' do
84
+ subject { Parser.together(spans.dup) }
85
+
86
+ it 'merges connected spans' do
87
+ subject.select { |s| s.type == :together }.must_equal(
88
+ [Span.new(281, 293, :together)]
89
+ )
90
+ end
91
+ end
70
92
  end
@@ -60,4 +60,14 @@ describe Span do
60
60
  Span.new(1, 2, 3).wont_equal Span.new(1, 2, 4)
61
61
  end
62
62
  end
63
+
64
+ describe 'slicing' do
65
+ let(:text) { 'test228' }
66
+
67
+ subject { Span.new(4, 7) }
68
+
69
+ it 'should extract slices using #slice' do
70
+ subject.slice(text).must_equal '228'
71
+ end
72
+ end
63
73
  end
@@ -19,11 +19,15 @@ class MiniTest::Test
19
19
  arguments = argv.dup
20
20
  options = (arguments.last.is_a? Hash) ? arguments.pop : {}
21
21
  executable = File.expand_path('../../../bin/greeb', __FILE__)
22
+ status = nil
22
23
 
23
- Open3.popen3(executable, *arguments) do |i, o, *_|
24
+ Open3.popen3(executable, *arguments) do |i, o, _, t|
24
25
  i.puts options[:stdin] if options[:stdin]
25
26
  i.close
26
27
  invoke_cache[argv] = o.readlines.map(&:chomp!)
28
+ status = t.value
27
29
  end
30
+
31
+ invoke_cache[argv] if status.success?
28
32
  end
29
33
  end
metadata CHANGED
@@ -1,27 +1,27 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2.1
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-15 00:00:00.000000000 Z
11
+ date: 2014-05-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '5.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '5.0'
27
27
  description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
@@ -33,8 +33,8 @@ executables:
33
33
  extensions: []
34
34
  extra_rdoc_files: []
35
35
  files:
36
- - .gitignore
37
- - .travis.yml
36
+ - ".gitignore"
37
+ - ".travis.yml"
38
38
  - Gemfile
39
39
  - LICENSE
40
40
  - README.md
@@ -68,17 +68,17 @@ require_paths:
68
68
  - lib
69
69
  required_ruby_version: !ruby/object:Gem::Requirement
70
70
  requirements:
71
- - - '>='
71
+ - - ">="
72
72
  - !ruby/object:Gem::Version
73
73
  version: '0'
74
74
  required_rubygems_version: !ruby/object:Gem::Requirement
75
75
  requirements:
76
- - - '>='
76
+ - - ">="
77
77
  - !ruby/object:Gem::Version
78
78
  version: '0'
79
79
  requirements: []
80
80
  rubyforge_project: greeb
81
- rubygems_version: 2.1.9
81
+ rubygems_version: 2.2.2
82
82
  signing_key:
83
83
  specification_version: 4
84
84
  summary: Greeb is a simple Unicode-aware regexp-based tokenizer.