chunkify 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b1714738ae78ae142793b80dd761e0a64413fd25ae73fa76ce60e0f099722903
4
- data.tar.gz: 376154406ea4b78983b5957030b3aa81767eca3b885c2a30b2b853f08bec724a
3
+ metadata.gz: bd15841c4753a975e1259857c96c524ec5e0c435d9adca98edf59a93cb9ca045
4
+ data.tar.gz: 3f06158f83273718b61c4a87809b53490acd209ff9e2c0e12593526cde195f1c
5
5
  SHA512:
6
- metadata.gz: d7b0f5f0436434fe235f4729c755135e21fa2002149f311523b6900379433d66351e5feaaa2880de5e6ba86c873160d719c4fd7432b31df1362a95a27d1dc9ea
7
- data.tar.gz: 494bcba38175ec9a10fffea9016708351f7b289e38d341cf2e8bc40b007de8c96ab7f61cb34bafb0b3637b8384a8a7c755b81f14e252e8a10323e8a12326786e
6
+ metadata.gz: e78c1a25870032ee80ba74d7e218743ccdbde6af6e64e5dcc4c12befceaa1f8d997adf77fa8be9529c6c885988a07f1504a5f38b9a8d45fde2611b7990f0c251
7
+ data.tar.gz: 4b778ecd6c56a9b413ef813f343bbba8dc18d9e051d62229339b7c10f5903e1b27ead3bb73ad7e3798b6125590503e56f3203aa4d62a9e42420dfdcfa01eabdd
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- chunkify (0.1.0)
4
+ chunkify (0.1.4)
5
5
  pry
6
6
 
7
7
  GEM
@@ -13,6 +13,7 @@ GEM
13
13
  coderay (~> 1.1)
14
14
  method_source (~> 1.0)
15
15
  rake (13.3.1)
16
+ tokenizer (0.3.0)
16
17
  yard (0.9.38)
17
18
 
18
19
  PLATFORMS
@@ -21,6 +22,7 @@ PLATFORMS
21
22
  DEPENDENCIES
22
23
  chunkify!
23
24
  rake (~> 13.0)
25
+ tokenizer
24
26
  yard (~> 0.9.38)
25
27
 
26
28
  BUNDLED WITH
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Chunkify
4
- VERSION = "0.1.3"
4
+ VERSION = "0.1.5"
5
5
  end
data/lib/chunkify.rb CHANGED
@@ -1,39 +1,59 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'tokenizer'
3
4
  require_relative "chunkify/version"
4
5
 
5
6
  module Chunkify
6
7
  class Error < StandardError; end
7
-
8
- @@SIZE = 1024
9
-
10
- def self.size! input
11
- @@SIZE = 2 ** input.split(" ").length
12
- end
13
-
14
- def self.size= s
15
- @@SIZE = s
16
- end
17
-
18
- def self.size
19
- @@SIZE
8
+ def self.tokens text
9
+ Tokenizer::WhitespaceTokenizer.new(:en).tokenize(text)
20
10
  end
21
-
22
- def self.split text, &b
23
- if !block_given?
24
- b = lambda { |e| e }
25
- end
11
+ def self.split text, size: 1024
26
12
  chunk = []
27
- paragraphs = text.gsub(/\r/, "").gsub(/\n\n+/,"\n\n").split(/\n\n+/)
13
+ paragraphs = text.gsub(/\r/, "").gsub(/\n\n+/,"\n\n").split(/\n\n+/).map { |e| e.split("\n").map { |ee| ee.strip.gsub(/ +/, " ") }.join("\n") }
28
14
  current_chunk = ""
29
15
  paragraphs.each do |para|
30
- if current_chunk.length + para.length > Chunkify.size && !current_chunk.empty?
31
- chunk << b.call(current_chunk)
32
- current_chunk = para.split("\n").map { |e| e.strip.gsub(/ +/, " ") }.join("\n")
16
+ if /Project Gutenberg/.match(para) || /gutenberg.org/.match(para)
17
+ next
33
18
  else
34
- current_chunk += (current_chunk.empty? ? "" : "\n\n") + para.split("\n").map { |e| e.strip.gsub(/ +/, " ") }.join("\n")
19
+ if current_chunk.length + para.length > size && !current_chunk.empty?
20
+ # premptive newline
21
+ chunk << current_chunk
22
+ # set next chunk
23
+ current_chunk = para
24
+ else
25
+ # add normally
26
+ current_chunk += (current_chunk.empty? ? "" : "\n\n") + para
27
+ end
35
28
  end
36
29
  end
37
- chunk << b.call(current_chunk) unless current_chunk.empty?
30
+ chunk << current_chunk unless current_chunk.empty?
31
+ end
32
+ @@DOC = Hash.new { |h,k| h[k] = Doc.new(k) }
33
+ class Doc
34
+ attr_accessor :doc
35
+ def initialize k
36
+ @id = k
37
+ end
38
+ # chunk document (... and handle)
39
+ def chunk &b
40
+ if block_given?
41
+ Chunkify.split(@doc).map { |e| b.call(e) }
42
+ else
43
+ Chunkify.split(@doc)
44
+ end
45
+ end
46
+ # chunk document to tokens (... and handle)
47
+ def tokens &b
48
+ if block_given?
49
+ chunk { |e| b.call(Chunkify.tokens(e)) }
50
+ else
51
+ chunk.map { |e| Chunkify.tokens(e) }
52
+ end
53
+ end
54
+ end
55
+ # document
56
+ def self.[] k
57
+ @@DOC[k]
38
58
  end
39
59
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chunkify
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erik Olson
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: tokenizer
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  description:
42
56
  email:
43
57
  - xorgnak@xorgnak.com