simple_text_extract 3.0.8 → 3.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 44412a154f56b1100983582f15f3d56800edf040dfac98a5ae9fb9b3fa2d6401
4
- data.tar.gz: a6c5b2d94e13be12c71d3b4a66d0c2aef8fc6b7d7084a4c2b2d490e49954916d
3
+ metadata.gz: b19c5d7467780a0ccb69fb18e29eb81c9acb7e16caf39a93ddeedaf325d541bd
4
+ data.tar.gz: 942c5a13853805d7e66feacf997e5a9181e69ea309a880916ca08653c0e3e8bd
5
5
  SHA512:
6
- metadata.gz: 88e630c8aeee64240196e96c93675ba15a1a7c2f3c2c1a67ab51a05d35dc404a11574f61995628b2b399cd4d42570d40beddb55c0b6cda0bccece97415d995a7
7
- data.tar.gz: 42ff8b4a4702c52702ac95b2bab6b79495384cdec6d9f68011fb04bfca778cf91442c010ab67ac0a1ee59d27148b00be43221fbd20392d6e96f53d4ec5ae7788
6
+ metadata.gz: 42303612565dbfe9f23f699c3e554fb9509bbf7165e24d5b4b1fbea3f0ffe0bcd9f8723ac39861341f978b734ab671d17f6aa5a530e6fcf47dddfbe6e796acd1
7
+ data.tar.gz: c07deca30b24a70fef2baf7967bcfb411d4a60bdd24c624ce0a328e83be4ed558dca2628f468fa27b9d66e5e317413539d15d7b1bb551a8ec987d80eb69723a6
data/Gemfile.lock CHANGED
@@ -1,7 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (3.0.8)
4
+ simple_text_extract (3.0.9)
5
+ base64
6
+ csv
5
7
  roo (~> 2.10.0)
6
8
  rubyzip (~> 2.3.2)
7
9
  spreadsheet (~> 1.3.0)
@@ -10,61 +12,62 @@ GEM
10
12
  remote: https://rubygems.org/
11
13
  specs:
12
14
  ast (2.4.2)
13
- bigdecimal (3.1.8)
15
+ base64 (0.2.0)
16
+ bigdecimal (3.1.9)
14
17
  coderay (1.1.3)
15
- json (2.7.2)
18
+ csv (3.3.2)
19
+ json (2.9.1)
16
20
  language_server-protocol (3.17.0.3)
17
- memory_profiler (1.0.2)
21
+ memory_profiler (1.1.0)
18
22
  method_source (1.1.0)
19
- minitest (5.24.1)
20
- mocha (2.4.0)
23
+ minitest (5.25.4)
24
+ mocha (2.7.1)
21
25
  ruby2_keywords (>= 0.0.5)
22
- nokogiri (1.16.6-arm64-darwin)
26
+ nokogiri (1.18.1-arm64-darwin)
23
27
  racc (~> 1.4)
24
- nokogiri (1.16.6-x86_64-linux)
28
+ nokogiri (1.18.1-x86_64-linux-gnu)
25
29
  racc (~> 1.4)
26
- parallel (1.25.1)
27
- parser (3.3.4.0)
30
+ parallel (1.26.3)
31
+ parser (3.3.6.0)
28
32
  ast (~> 2.4.1)
29
33
  racc
30
- pry (0.14.2)
34
+ pry (0.15.2)
31
35
  coderay (~> 1.1)
32
36
  method_source (~> 1.0)
33
- racc (1.8.0)
37
+ racc (1.8.1)
34
38
  rainbow (3.1.1)
35
39
  rake (13.2.1)
36
- regexp_parser (2.9.2)
37
- rexml (3.3.1)
38
- strscan
40
+ regexp_parser (2.10.0)
39
41
  roo (2.10.1)
40
42
  nokogiri (~> 1)
41
43
  rubyzip (>= 1.3.0, < 3.0.0)
42
- rubocop (1.65.0)
44
+ rubocop (1.69.2)
43
45
  json (~> 2.3)
44
46
  language_server-protocol (>= 3.17.0)
45
47
  parallel (~> 1.10)
46
48
  parser (>= 3.3.0.2)
47
49
  rainbow (>= 2.2.2, < 4.0)
48
- regexp_parser (>= 2.4, < 3.0)
49
- rexml (>= 3.2.5, < 4.0)
50
- rubocop-ast (>= 1.31.1, < 2.0)
50
+ regexp_parser (>= 2.9.3, < 3.0)
51
+ rubocop-ast (>= 1.36.2, < 2.0)
51
52
  ruby-progressbar (~> 1.7)
52
- unicode-display_width (>= 2.4.0, < 3.0)
53
- rubocop-ast (1.31.3)
53
+ unicode-display_width (>= 2.4.0, < 4.0)
54
+ rubocop-ast (1.37.0)
54
55
  parser (>= 3.3.1.0)
55
56
  ruby-ole (1.2.13.1)
56
57
  ruby-progressbar (1.13.0)
57
58
  ruby2_keywords (0.0.5)
58
59
  rubyzip (2.3.2)
59
- spreadsheet (1.3.1)
60
+ spreadsheet (1.3.3)
60
61
  bigdecimal
61
62
  ruby-ole
62
- strscan (3.1.0)
63
- unicode-display_width (2.5.0)
63
+ unicode-display_width (3.1.3)
64
+ unicode-emoji (~> 4.0, >= 4.0.4)
65
+ unicode-emoji (4.0.4)
64
66
 
65
67
  PLATFORMS
66
68
  arm64-darwin-21
67
69
  arm64-darwin-23
70
+ arm64-darwin-24
68
71
  x86_64-linux
69
72
 
70
73
  DEPENDENCIES
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "open3"
4
+
3
5
  class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
4
6
  def self.formatter(path)
5
7
  case path
@@ -78,7 +80,13 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
78
80
  def pdf_extract
79
81
  return nil if SimpleTextExtract.missing_dependency?("pdftotext")
80
82
 
81
- `pdftotext #{Shellwords.escape(file.path)} - 2>/dev/null`
83
+ stdout, stderr, status = Open3.capture3("pdftotext #{Shellwords.escape(file.path)} -")
84
+ if status.success?
85
+ stdout
86
+ else
87
+ warn "pdftotext failed: #{stderr}"
88
+ nil
89
+ end
82
90
  end
83
91
 
84
92
  def xlsx_extract
@@ -117,7 +125,13 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
117
125
  def doc_extract
118
126
  return nil if SimpleTextExtract.missing_dependency?("antiword")
119
127
 
120
- `antiword #{Shellwords.escape(file.path)}`
128
+ stdout, stderr, status = Open3.capture3("antiword #{Shellwords.escape(file.path)}")
129
+ if status.success?
130
+ stdout
131
+ else
132
+ warn "antiword failed: #{stderr}"
133
+ nil
134
+ end
121
135
  end
122
136
 
123
137
  def docx_extract
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "3.0.8"
4
+ VERSION = "3.0.9"
5
5
  end
@@ -29,4 +29,6 @@ Gem::Specification.new do |spec|
29
29
  spec.add_dependency "roo", "~> 2.10.0"
30
30
  spec.add_dependency "rubyzip", "~> 2.3.2"
31
31
  spec.add_dependency "spreadsheet", "~> 1.3.0"
32
+ spec.add_dependency "base64"
33
+ spec.add_dependency "csv"
32
34
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.8
4
+ version: 3.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-12 00:00:00.000000000 Z
11
+ date: 2024-12-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -52,6 +52,34 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: 1.3.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: base64
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: csv
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
55
83
  description: Extract text from various file types before resorting to an OCR solution.
56
84
  email:
57
85
  - nickweiland@gmail.com
@@ -97,7 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
125
  requirements:
98
126
  - antiword
99
127
  - pdftotext/poppler
100
- rubygems_version: 3.5.15
128
+ rubygems_version: 3.5.21
101
129
  signing_key:
102
130
  specification_version: 4
103
131
  summary: Extract text from various file types before resorting to an OCR solution.