act_as_page_extractor 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6fd076b19cd732c20e8aea0fd17f280523ff1b3013778d599757095c1f3e1e88
4
- data.tar.gz: 41832e5eb175e7417b2862a72fa72b063a60a95692f5116b4dab98f5401b37bb
3
+ metadata.gz: d0bd64f8e12d0c7bb3a75893738e30af616e4bcc5b958b18853b35363823b5ef
4
+ data.tar.gz: d87505b025bd924a545e2f6cbd9071958d65f993a234092f65b3cb7e108b16b1
5
5
  SHA512:
6
- metadata.gz: c2e3a77a42c332c7666b9e474b83dbb72527aa1a22833a3e59ecd97afa9145699479ade5dd2f8e6f1e434ebd8a11179179d96553e86297e984cf6808f086d78b
7
- data.tar.gz: 91b844ef793d0724454cb92c92e5600cde2924ca4925d80b2150fb25c7bfed6354b8bd817fde978da2d35928f47ce8f11c36934d739b181aa95dce802e4e0232
6
+ metadata.gz: af0708407f3b4546424e1666926c248cdb9fe0813ede2dd642d099836282d2f608d8d47edcd5cd513cef9b3ead231c192f6b815ec7721eb141b6820f561d0f30
7
+ data.tar.gz: 6a5969118ff6a6141aaaec8989e38670f75a817afa230636ed84f9f2a4e7c1f160ee569664f940e1355cfc74dff56e30370ca993fbb38d2a7139c17f56858acf
data/Gemfile CHANGED
@@ -3,7 +3,7 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in total_compressor.gemspec
4
4
  gemspec
5
5
 
6
- gem 'activerecord', '~> 6.0'
6
+ gem 'activerecord', '~> 6'
7
7
 
8
8
  gem 'awesome_print'
9
9
 
data/Gemfile.lock CHANGED
@@ -1,14 +1,14 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- act_as_page_extractor (0.2.3)
5
- activerecord (~> 6.0)
4
+ act_as_page_extractor (0.6.1)
5
+ activerecord (~> 6)
6
6
  awesome_print (~> 1)
7
7
  docsplit (~> 0)
8
8
  filesize (~> 0)
9
- pdf-reader (~> 1.4.0, >= 1.4.0)
9
+ pdf-reader (~> 1, >= 1.4)
10
10
  pdf_utils (~> 0)
11
- prawn (~> 0.7.1)
11
+ prawn (~> 1)
12
12
  total_compressor (~> 0)
13
13
 
14
14
  GEM
@@ -38,6 +38,7 @@ GEM
38
38
  i18n (1.14.5)
39
39
  concurrent-ruby (~> 1.0)
40
40
  minitest (5.25.1)
41
+ pdf-core (0.4.0)
41
42
  pdf-reader (1.4.1)
42
43
  Ascii85 (~> 1.0.0)
43
44
  afm (~> 0.2.1)
@@ -45,13 +46,9 @@ GEM
45
46
  ruby-rc4
46
47
  ttfunk
47
48
  pdf_utils (0.1.0)
48
- prawn (0.7.2)
49
- prawn-core (>= 0.7.2, < 0.8)
50
- prawn-layout (>= 0.7.2, < 0.8)
51
- prawn-security (>= 0.7.1, < 0.8)
52
- prawn-core (0.7.2)
53
- prawn-layout (0.7.2)
54
- prawn-security (0.7.1)
49
+ prawn (1.3.0)
50
+ pdf-core (~> 0.4.0)
51
+ ttfunk (~> 1.4.0)
55
52
  rake (12.3.3)
56
53
  rspec (3.13.0)
57
54
  rspec-core (~> 3.13.0)
@@ -77,7 +74,7 @@ GEM
77
74
  total_compressor (0.1.11)
78
75
  awesome_print (~> 1.1, >= 1.1.0)
79
76
  rubyzip (~> 1.2, >= 1.2.2)
80
- ttfunk (1.7.0)
77
+ ttfunk (1.4.0)
81
78
  tzinfo (2.0.6)
82
79
  concurrent-ruby (~> 1.0)
83
80
  zeitwerk (2.6.17)
@@ -87,16 +84,16 @@ PLATFORMS
87
84
 
88
85
  DEPENDENCIES
89
86
  act_as_page_extractor!
90
- activerecord (~> 6.0)
87
+ activerecord (~> 6)
91
88
  awesome_print
92
- bundler (~> 1.3)
89
+ bundler (~> 1)
93
90
  byebug
94
91
  docsplit
95
92
  filesize
96
93
  pdf-reader
97
94
  pdf_utils
98
95
  prawn
99
- rake (~> 12.3, >= 12.3.3)
96
+ rake (~> 12, >= 12.3.3)
100
97
  rspec
101
98
  simplecov
102
99
  total_compressor
data/README.md CHANGED
@@ -7,102 +7,111 @@ Library for extracting plain text from documents(files) for further processing (
7
7
 
8
8
  Install appropriate tools before using:
9
9
 
10
- sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
11
-
10
+ ```sh
11
+ sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
12
+ ```
12
13
  Add this line to your application's Gemfile:
13
14
 
14
- gem 'act_as_page_extractor'
15
-
15
+ ```rb
16
+ gem 'act_as_page_extractor'
17
+ bundle
18
+ ```
16
19
  ## Usage
17
20
 
18
- For example, for model Document we need execute:
21
+ For example, for the Document model in the Rails framework we need to run:
19
22
 
20
- $ bundle
21
- $ rails g act_as_page_extractor:migration Document category_id user_id
23
+ ```sh
24
+ rails g act_as_page_extractor:migration Document category_id user_id
25
+ ```
22
26
 
23
27
  As a result we get two migration files:
24
-
25
- class AddPageExtractorFields < ActiveRecord::Migration
26
- def change
27
- add_column :documents, :page_extraction_state, :string, default: ''
28
- add_column :documents, :page_extraction_pages, :integer, default: 0
29
- add_column :documents, :page_extraction_doctype, :string, default: ''
30
- add_column :documents, :page_extraction_filesize, :string, default: ''
31
- end
32
- end
33
-
34
- class CreateExtractedPages < ActiveRecord::Migration
35
- def change
36
- create_table :extracted_pages do |t|
37
- t.text :page
38
- t.integer :document_id
39
- t.integer :category_id
40
- t.integer :user_id
41
- t.integer :page_number
42
-
43
- t.timestamps null: false
44
- end
45
-
46
- add_index :extracted_pages, :document_id
47
- add_index :extracted_pages, :category_id
48
- add_index :extracted_pages, [:document_id, :category_id]
49
- add_index :extracted_pages, [:document_id, :page_number]
50
- end
28
+ ```rb
29
+ class AddPageExtractorFields < ActiveRecord::Migration
30
+ def change
31
+ add_column :documents, :page_extraction_state, :string, default: ''
32
+ add_column :documents, :page_extraction_pages, :integer, default: 0
33
+ add_column :documents, :page_extraction_doctype, :string, default: ''
34
+ add_column :documents, :page_extraction_filesize, :string, default: ''
35
+ end
36
+ end
37
+
38
+ class CreateExtractedPages < ActiveRecord::Migration
39
+ def change
40
+ create_table :extracted_pages do |t|
41
+ t.text :page
42
+ t.integer :document_id
43
+ t.integer :category_id
44
+ t.integer :user_id
45
+ t.integer :page_number
46
+
47
+ t.timestamps null: false
51
48
  end
52
49
 
50
+ add_index :extracted_pages, :document_id
51
+ add_index :extracted_pages, :category_id
52
+ add_index :extracted_pages, [:document_id, :category_id]
53
+ add_index :extracted_pages, [:document_id, :page_number]
54
+ end
55
+ end
56
+ ```
53
57
 
54
- Model Document must have field which contains path to file(supports [different archive types](https://github.com/phlowerteam/total_compressor) that contains [txt, pdf, doc/x, txt, html, rtf, ...](https://www.exoplatform.com/docs/public/index.jsp?topic=%2FPLF43%2FPLFAdminGuide.Configuration.JODConverter.html))
58
+ Model Document must have a field which contains the path to a file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, txt, html, rtf, ...](https://docs-old.exoplatform.org/public/index.jsp?topic=%2FPLF41%2FPLFAdminGuide.Configuration.JODConverter.html))
55
59
 
56
60
  Add to model next parameters for initializing:
57
61
 
58
- class Document < ActiveRecord::Base
59
- include ActAsPageExtractor
62
+ ```rb
63
+ class Document < ActiveRecord::Base
64
+ include ActAsPageExtractor
60
65
 
61
- act_as_page_extractor options: {
62
- document_class: 'Document',
63
- save_as_pdf: true,
64
- filename: :filename,
65
- document_id: :document_id,
66
- additional_fields: [:category_id, :user_id],
67
- #file_storage: "/full/path/to/documents/storage",
68
- #pdf_storage: "/full/path/to/extracted/pdf/storage"
69
- }
66
+ act_as_page_extractor options: {
67
+ document_class: 'Document',
68
+ save_as_pdf: true,
69
+ filename: :filename,
70
+ document_id: :document_id,
71
+ additional_fields: [:category_id, :user_id],
72
+ #file_storage: "/full/path/to/documents/storage",
73
+ #pdf_storage: "/full/path/to/extracted/pdf/storage"
74
+ }
70
75
 
71
- has_many :extracted_pages, dependent: :destroy
72
- end
76
+ has_many :extracted_pages, dependent: :destroy
77
+ end
78
+ ```
73
79
 
74
80
  Now our instance has few new methods:
75
81
 
76
- document = Document.first
77
- document.page_extract!
78
- document.extracted_pages
79
- document.pdf_path # if option 'save_as_pdf' is 'true'
82
+ ```rb
83
+ document = Document.first
84
+ document.page_extract!
85
+ document.extracted_pages
86
+ document.pdf_path # if option 'save_as_pdf' is 'true'
80
87
 
81
- # Access to pages
82
- ExtractedPage.count
88
+ # Access to pages
89
+ ExtractedPage.count
83
90
 
84
- # Importing whole directory of documents
85
- ActAsPageExtractor.import_files('/path/to/foler/with/documents')
91
+ # Importing whole directory of documents
92
+ ActAsPageExtractor.import_files('/path/to/foler/with/documents')
86
93
 
87
- # We can use cron for run the processing of all the new documents
88
- ActAsPageExtractor.start_extraction
94
+ # We can use cron to run the processing of all the new documents
95
+ ActAsPageExtractor.start_extraction
89
96
 
90
- # Getting statistics information of all documents
91
- ActAsPageExtractor.statistics
97
+ # Getting statistics information of all documents
98
+ ActAsPageExtractor.statistics
99
+ ```
92
100
 
93
- Parameters of initializing `act_as_page_extractor options: { ... }`:
101
+ Parameters of initializing **act_as_page_extractor**:
94
102
 
95
- `document_class` - name of model (e.g. 'Document)
96
- `save_as_pdf` - boolean [true, false] when we want save temporary pdf
97
- `filename` - name of field which contains access to the file and it should be an object with 'url' method that returns path to file (e.g. CarrierWave object with 'filename.url')
98
- `document_id` - name for saving id
99
- `additional_fields` - additional fields that added to extracted page (e.g. for indexing, etc.)
100
- `file_storage` - path for saving tmp files (by default it is "public")
101
- `pdf_storage` - path for saving pdf (by default it is "public/uploads/extracted/pdf")
103
+ * **document_class** - name of model (e.g. Document)
104
+ * **save_as_pdf** - boolean [true, false] when we want to save a temporary pdf
105
+ * **filename** - name of field which contains access to the file and it should be an object with 'url' method that returns path to file (e.g. CarrierWave object with 'filename.url')
106
+ * **document_id** - name for saving id
107
+ * **additional_fields** - additional fields that are added to the extracted page (e.g. for indexing, etc.)
108
+ * **file_storage** - path for saving tmp files (by default it is "public")
109
+ * **pdf_storage** - path for saving pdf (by default it is "public/uploads/extracted/pdf")
102
110
 
103
111
  ## Run tests
104
- $ COVERAGE=true rspec
105
-
112
+ ```sh
113
+ rspec
114
+ ```
106
115
  ## Contributing
107
116
  1. Fork it
108
117
  2. Create your feature branch (`git checkout -b my-new-feature`)
@@ -115,5 +124,5 @@ https://github.com/phlowerteam
115
124
  phlowerteam@gmail.com
116
125
 
117
126
  ## License
118
- Copyright (c) 2017 PhlowerTeam
127
+ Copyright (c) 2024 PhlowerTeam
119
128
  MIT License
@@ -10,25 +10,25 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ['phlowerteam@gmail.com']
11
11
  spec.description = %q{Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice}
12
12
  spec.summary = %q{Uses system calls}
13
- spec.homepage = 'https://github.com/phlowerteam'
13
+ spec.homepage = 'https://github.com/phlowerteam/act_as_page_extractor'
14
14
  spec.license = 'MIT'
15
15
 
16
16
  spec.files = `git ls-files`.split($/)
17
17
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
18
  spec.require_paths = ['lib']
19
19
 
20
- spec.add_development_dependency 'bundler', '~> 1.3'
21
- spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.3'
20
+ spec.add_development_dependency 'bundler', '~> 1'
21
+ spec.add_development_dependency 'rake', '~> 12', '>= 12.3.3'
22
22
  spec.add_development_dependency 'byebug', '~> 0'
23
23
  spec.add_development_dependency 'rspec', '~> 0'
24
24
  spec.add_development_dependency 'simplecov', '~> 0'
25
25
 
26
- spec.add_runtime_dependency 'activerecord', '~> 6.0'
26
+ spec.add_runtime_dependency 'activerecord', '~> 6'
27
27
  spec.add_runtime_dependency 'awesome_print', '~> 1'
28
28
  spec.add_runtime_dependency 'docsplit', '~> 0' # API for OpenOffice jodconverter (any to pdf)
29
29
  spec.add_runtime_dependency 'pdf_utils', '~> 0' # getting text from pdf
30
- spec.add_runtime_dependency 'prawn', '~> 0.7.1' # need for pdf_utils
31
- spec.add_runtime_dependency 'pdf-reader', '~> 1.4.0', '>= 1.4.0' # need for pdf_utils
30
+ spec.add_runtime_dependency 'prawn', '~> 1' # need for pdf_utils
31
+ spec.add_runtime_dependency 'pdf-reader', '~> 1', '>= 1.4' # need for pdf_utils
32
32
  spec.add_runtime_dependency 'total_compressor', '~> 0' # decompressing files
33
33
  spec.add_runtime_dependency 'filesize', '~> 0' # pretty size of file
34
34
  end
@@ -0,0 +1,14 @@
1
+ # GEM publishing
2
+
3
+ ```sh
4
+ # Add features or fix bugs
5
+ # Increase version number x.y.z
6
+ # lib/act_as_page_extractor/version.rb
7
+ bundle update
8
+ rspec
9
+ # git commit & git push
10
+
11
+ gem build act_as_page_extractor.gemspec
12
+ gem install ./act_as_page_extractor-x.y.z.gem
13
+ gem push act_as_page_extractor-x.y.z.gem
14
+ ```
@@ -1,17 +1,17 @@
1
1
  require 'timeout'
2
2
 
3
3
  module ActAsPageExtractor
4
+ # :nocov:
4
5
  def timeout_wrapper
5
6
  result = nil
6
7
  begin
7
8
  result = Timeout::timeout(60*5) { yield }
8
9
  rescue
9
- # :nocov:
10
10
  ensure
11
- # :nocov:
12
11
  result
13
12
  end
14
13
  end
14
+ # :nocov:
15
15
 
16
16
  def is_extracted
17
17
  @pdf_pages.to_i > 0 && self.extracted_pages.count == @pdf_pages
@@ -46,11 +46,11 @@ module ActAsPageExtractor
46
46
  # ap "@copy_document_path"
47
47
  # ap @copy_document_path
48
48
  # ap "@document_path"
49
- ap @document_path
49
+ # ap @document_path
50
50
  # ap "@pdf_path"
51
51
  # ap @pdf_path
52
52
  # ap "@pdf_pages"
53
- ap @pdf_pages
53
+ # ap @pdf_pages
54
54
  end
55
55
  # :nocov:
56
56
  end
@@ -1,5 +1,5 @@
1
1
  # :nocov:
2
2
  module ActAsPageExtractor
3
- VERSION = "0.6.0"
3
+ VERSION = "0.6.2"
4
4
  end
5
5
  # :nocov:
data/spec/spec_helper.rb CHANGED
@@ -1,7 +1,11 @@
1
- if ENV['COVERAGE']
1
+ unless ENV['SKIP_COVERAGE']
2
2
  require 'simplecov'
3
- SimpleCov.start 'rails'
3
+ SimpleCov.start 'rails' do
4
+ add_filter 'vendor'
5
+ end
6
+ SimpleCov.minimum_coverage 100
4
7
  end
8
+
5
9
  require 'rspec'
6
10
  require 'support/models'
7
11
  require 'act_as_page_extractor'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: act_as_page_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - PhlowerTeam
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-30 00:00:00.000000000 Z
11
+ date: 2024-08-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,21 +16,21 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.3'
19
+ version: '1'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.3'
26
+ version: '1'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '12.3'
33
+ version: '12'
34
34
  - - ">="
35
35
  - !ruby/object:Gem::Version
36
36
  version: 12.3.3
@@ -40,7 +40,7 @@ dependencies:
40
40
  requirements:
41
41
  - - "~>"
42
42
  - !ruby/object:Gem::Version
43
- version: '12.3'
43
+ version: '12'
44
44
  - - ">="
45
45
  - !ruby/object:Gem::Version
46
46
  version: 12.3.3
@@ -92,14 +92,14 @@ dependencies:
92
92
  requirements:
93
93
  - - "~>"
94
94
  - !ruby/object:Gem::Version
95
- version: '6.0'
95
+ version: '6'
96
96
  type: :runtime
97
97
  prerelease: false
98
98
  version_requirements: !ruby/object:Gem::Requirement
99
99
  requirements:
100
100
  - - "~>"
101
101
  - !ruby/object:Gem::Version
102
- version: '6.0'
102
+ version: '6'
103
103
  - !ruby/object:Gem::Dependency
104
104
  name: awesome_print
105
105
  requirement: !ruby/object:Gem::Requirement
@@ -148,34 +148,34 @@ dependencies:
148
148
  requirements:
149
149
  - - "~>"
150
150
  - !ruby/object:Gem::Version
151
- version: 0.7.1
151
+ version: '1'
152
152
  type: :runtime
153
153
  prerelease: false
154
154
  version_requirements: !ruby/object:Gem::Requirement
155
155
  requirements:
156
156
  - - "~>"
157
157
  - !ruby/object:Gem::Version
158
- version: 0.7.1
158
+ version: '1'
159
159
  - !ruby/object:Gem::Dependency
160
160
  name: pdf-reader
161
161
  requirement: !ruby/object:Gem::Requirement
162
162
  requirements:
163
- - - ">="
164
- - !ruby/object:Gem::Version
165
- version: 1.4.0
166
163
  - - "~>"
167
164
  - !ruby/object:Gem::Version
168
- version: 1.4.0
165
+ version: '1'
166
+ - - ">="
167
+ - !ruby/object:Gem::Version
168
+ version: '1.4'
169
169
  type: :runtime
170
170
  prerelease: false
171
171
  version_requirements: !ruby/object:Gem::Requirement
172
172
  requirements:
173
- - - ">="
174
- - !ruby/object:Gem::Version
175
- version: 1.4.0
176
173
  - - "~>"
177
174
  - !ruby/object:Gem::Version
178
- version: 1.4.0
175
+ version: '1'
176
+ - - ">="
177
+ - !ruby/object:Gem::Version
178
+ version: '1.4'
179
179
  - !ruby/object:Gem::Dependency
180
180
  name: total_compressor
181
181
  requirement: !ruby/object:Gem::Requirement
@@ -221,6 +221,7 @@ files:
221
221
  - README.md
222
222
  - Rakefile
223
223
  - act_as_page_extractor.gemspec
224
+ - docs/publishing.md
224
225
  - lib/act_as_page_extractor.rb
225
226
  - lib/act_as_page_extractor/modules/extracting.rb
226
227
  - lib/act_as_page_extractor/modules/interface.rb
@@ -248,7 +249,7 @@ files:
248
249
  - test/test-doc-3-pages.rtf
249
250
  - test/test-doc-3-pages.txt
250
251
  - test/test-doc-3-pages.wrong
251
- homepage: https://github.com/phlowerteam
252
+ homepage: https://github.com/phlowerteam/act_as_page_extractor
252
253
  licenses:
253
254
  - MIT
254
255
  metadata: {}
@@ -267,7 +268,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
267
268
  - !ruby/object:Gem::Version
268
269
  version: '0'
269
270
  requirements: []
270
- rubygems_version: 3.0.3
271
+ rubygems_version: 3.3.22
271
272
  signing_key:
272
273
  specification_version: 4
273
274
  summary: Uses system calls