parsefile 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/ocrfile.rb +6 -2
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0b13ab5008debd780cd15627b816d35ffc40fae1
4
- data.tar.gz: 34b49fcc2c705fb4f00cd782133b68ed44d37886
3
+ metadata.gz: 2738907314aac77a653fd655f4a200e2a63b6ed6
4
+ data.tar.gz: 8c780c9b2304699e37f71cdb7f4842918bc61fc9
5
5
  SHA512:
6
- metadata.gz: d08d0c4314107dc2aa413af4cb4144181330fd7b4c545cd6320873ff833e1fa22a33f5e9d2001831dae23bcc17b95df48f5a063849a39d0fed82c1e92209cf61
7
- data.tar.gz: 450304b1c5d2ab8d3a7cc43f8fe71d9aed02c5dab8f3288ba213b1d599351751da552a26e3a04ba11f368bad4262a8703dc1b6c11285df2d891b2795082759e1
6
+ metadata.gz: e1385fdeb0923ebe72068c529341314aabebe1123f161e636762c19ad3325518716c759f9eae5fdd4bc6f12892709a45fe2dc5381e6151e6472677800fb31865
7
+ data.tar.gz: 919fbc77f37f6d40b15726ae9d5f36b622fdbe37d2789f730da61b5dea61d24ec12f359b607b096dbc5f0afd936e30fb68f23ee50efdd6276e3aa3c9273d6ade
data/lib/ocrfile.rb CHANGED
@@ -28,7 +28,7 @@ class OCRFile
28
28
  end
29
29
  end
30
30
  rescue # Detect errors
31
- binding.pry
31
+ #binding.pry
32
32
  end
33
33
 
34
34
  return @text
@@ -36,17 +36,20 @@ class OCRFile
36
36
 
37
37
  # Check if file is pdf
38
38
  def is_pdf?
39
+ puts "determined: is_pdf"
39
40
  file_start = File.open(@path, 'r') { |f| f.read(8)}
40
41
  file_start.match(/\%PDF-\d+\.?\d+/)
41
42
  end
42
43
 
43
44
  # Load text that is already extracted
44
45
  def load_extracted_text(file)
46
+ puts "file exists: load_extracted_text"
45
47
  @text = JSON.parse(File.read(file))["text"]
46
48
  end
47
49
 
48
50
  # Send file to give me text
49
51
  def give_me_text
52
+ puts "using: give_me_text"
50
53
  c = Curl::Easy.new("http://givemetext.okfnlabs.org/tika/tika/form")
51
54
  c.multipart_form_post = true
52
55
  c.http_post(Curl::PostField.file('file', @path))
@@ -56,6 +59,7 @@ class OCRFile
56
59
  end
57
60
 
58
61
  def give_me_text_local
62
+ puts "using: give_me_text_local"
59
63
  c = Curl::Easy.new(@tika + "/tika")
60
64
  # TODO: move this mime filtering to a higher global level
61
65
  mime_magic = MimeMagic.by_path(@path)
@@ -65,7 +69,6 @@ class OCRFile
65
69
  c.http_put(file_data)
66
70
 
67
71
  #binding.pry
68
-
69
72
  @text = c.body_str
70
73
  gotten_text_ok?(@text)
71
74
  end
@@ -77,6 +80,7 @@ class OCRFile
77
80
 
78
81
  # OCR with tesseract
79
82
  def ocr_pdf
83
+ puts "using: ocr_pdf"
80
84
  # Dir_paths
81
85
  base = Dir.pwd+"/"
82
86
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parsefile
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-10-10 00:00:00.000000000 Z
12
+ date: 2017-01-19 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: OCR file and extract metadata using Apache Tika and Tesseract
15
15
  email: shidash@shidash.com