RubyGems - docpdftotext - Versions diffs - 0.0.2 → 0.0.3 - Mend

docpdftotext 0.0.2 → 0.0.3

Files changed (13) hide show

data/.document ADDED

@@ -0,0 +1,5 @@
+README.rdoc
+lib/**/*.rb
+bin/*
+features/**/*.feature
+LICENSE

data/.gitignore ADDED

@@ -0,0 +1,17 @@
+log/*
+tmp/**/*
+doc/api
+doc/app
+doc/plugins
+*~
+config/keys
+/public/images/Thumbs.db
+Thumbs.db
+public/system
+public/demos
+*.sw?
+.DS_Store
+coverage
+rdoc
+pkg
+*~

data/README.rdoc CHANGED

@@ -9,7 +9,7 @@ This gem enables you to interact with document conversion libraries through Rail
 == Requirements
  * Antiword: http://www.winfield.demon.nl/
- * pdf-reader: http://github.com/yob/pdf-reader
+ * pdftotext: http://packages.ubuntu.com/hardy/poppler-utils
  * OdfConverter: http://www.oooninja.com/2008/01/convert-openxml-docx-etc-in-linux-using.html
  * Openoffice-headless: http://wiki.alfresco.com/wiki/Running_OpenOffice_From_Terminal
  * DocumentConverter.py (included): http://artofsolving.com/opensource/pyodconverter

data/Rakefile ADDED

@@ -0,0 +1,57 @@
+require 'rubygems'
+require 'rake'
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    gem.name = "docpdftotext"
+    gem.summary = %Q{Convert word to text in ruby}
+    gem.description = %Q{wrappers for libraries to convert documents into text}
+    gem.email = "eric@ericsilverberg.com"
+    gem.homepage = "http://github.com/esilverberg/docpdftotext"
+    gem.authors = ["esilverberg"]
+    gem.add_development_dependency "thoughtbot-shoulda"
+    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+end
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/*_test.rb'
+  test.verbose = true
+end
+begin
+  require 'rcov/rcovtask'
+  Rcov::RcovTask.new do |test|
+    test.libs << 'test'
+    test.pattern = 'test/**/*_test.rb'
+    test.verbose = true
+  end
+rescue LoadError
+  task :rcov do
+    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+  end
+end
+task :test => :check_dependencies
+task :default => :test
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  if File.exist?('VERSION')
+    version = File.read('VERSION')
+  else
+    version = ""
+  end
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "antiword #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED

@@ -0,0 +1 @@
+0.0.3

data/docpdftotext.gemspec ADDED

@@ -0,0 +1,57 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE
+# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s| # Jeweler-generated specification for the docpdftotext gem
+  s.name = %q{docpdftotext}
+  s.version = "0.0.3"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= # only set when this RubyGems exposes the setter
+  s.authors = ["esilverberg"]
+  s.date = %q{2009-11-23}
+  s.description = %q{wrappers for libraries to convert documents into text}
+  s.email = %q{eric@ericsilverberg.com}
+  s.extra_rdoc_files = [
+    "LICENSE",
+     "README.rdoc"
+  ]
+  s.files = [
+    ".document",
+     ".gitignore",
+     "LICENSE",
+     "README.rdoc",
+     "Rakefile",
+     "VERSION",
+     "docpdftotext.gemspec",
+     "lib/DocumentConverter.py",
+     "lib/docpdftotext.rb",
+     "test/docpdftotext_test.rb",
+     "test/test.doc",
+     "test/test.docx",
+     "test/test.pdf",
+     "test/test_helper.rb"
+  ]
+  s.homepage = %q{http://github.com/esilverberg/docpdftotext}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.5}
+  s.summary = %q{Convert word to text in ruby}
+  s.test_files = [
+    "test/docpdftotext_test.rb",
+     "test/test_helper.rb"
+  ]
+  if s.respond_to? :specification_version then # RubyGems without this attribute takes the outer else branch
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION # assigned but never read below
+    s.specification_version = 3
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then # shoulda is a development-only dependency from RubyGems 1.2.0 on
+      s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
+    else
+      s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"]) # RubyGems < 1.2.0: declared as a regular dependency instead
+    end
+  else
+    s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"]) # same fallback when specification_version is unsupported
+  end
+end

data/lib/DocumentConverter.py ADDED

@@ -0,0 +1,231 @@
+#
+# PyODConverter (Python OpenDocument Converter) v1.1 - 2009-11-14
+#
+# This script converts a document from one office format to another by
+# connecting to an OpenOffice.org instance via Python-UNO bridge.
+#
+# Copyright (C) 2008-2009 Mirko Nasato <mirko@artofsolving.com>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl-2.1.html
+# - or any later version.
+#
+DEFAULT_OPENOFFICE_PORT = 8100
+import uno
+from os.path import abspath, isfile, splitext
+from com.sun.star.beans import PropertyValue
+from com.sun.star.task import ErrorCodeIOException
+from com.sun.star.connection import NoConnectException
+FAMILY_TEXT = "Text"
+FAMILY_WEB = "Web"
+FAMILY_SPREADSHEET = "Spreadsheet"
+FAMILY_PRESENTATION = "Presentation"
+FAMILY_DRAWING = "Drawing"
+#---------------------#
+# Configuration Start #
+#---------------------#
+# see http://wiki.services.openoffice.org/wiki/Framework/Article/Filter
+# most formats are auto-detected; only those requiring options are defined here
+IMPORT_FILTER_MAP = {
+    "txt": {
+        "FilterName": "Text (encoded)",
+        "FilterOptions": "utf8"
+    },
+    "csv": {
+        "FilterName": "Text - txt - csv (StarCalc)",
+        "FilterOptions": "44,34,0"
+    }
+}
+EXPORT_FILTER_MAP = {
+    "pdf": {
+        FAMILY_TEXT: { "FilterName": "writer_pdf_Export" },
+        FAMILY_WEB: { "FilterName": "writer_web_pdf_Export" },
+        FAMILY_SPREADSHEET: { "FilterName": "calc_pdf_Export" },
+        FAMILY_PRESENTATION: { "FilterName": "impress_pdf_Export" },
+        FAMILY_DRAWING: { "FilterName": "draw_pdf_Export" }
+    },
+    "html": {
+        FAMILY_TEXT: { "FilterName": "HTML (StarWriter)" },
+        FAMILY_SPREADSHEET: { "FilterName": "HTML (StarCalc)" },
+        FAMILY_PRESENTATION: { "FilterName": "impress_html_Export" }
+    },
+    "odt": {
+        FAMILY_TEXT: { "FilterName": "writer8" },
+        FAMILY_WEB: { "FilterName": "writerweb8_writer" }
+    },
+    "doc": {
+        FAMILY_TEXT: { "FilterName": "MS Word 97" }
+    },
+    "rtf": {
+        FAMILY_TEXT: { "FilterName": "Rich Text Format" }
+    },
+    "txt": {
+        FAMILY_TEXT: {
+            "FilterName": "Text",
+            "FilterOptions": "utf8"
+        }
+    },
+    "ods": {
+        FAMILY_SPREADSHEET: { "FilterName": "calc8" }
+    },
+    "xls": {
+        FAMILY_SPREADSHEET: { "FilterName": "MS Excel 97" }
+    },
+    "csv": {
+        FAMILY_SPREADSHEET: {
+            "FilterName": "Text - txt - csv (StarCalc)",
+            "FilterOptions": "44,34,0"
+        }
+    },
+    "odp": {
+        FAMILY_PRESENTATION: { "FilterName": "impress8" }
+    },
+    "ppt": {
+        FAMILY_PRESENTATION: { "FilterName": "MS PowerPoint 97" }
+    },
+    "swf": {
+        FAMILY_DRAWING: { "FilterName": "draw_flash_Export" },
+        FAMILY_PRESENTATION: { "FilterName": "impress_flash_Export" }
+    }
+}
+PAGE_STYLE_OVERRIDE_PROPERTIES = {
+    FAMILY_SPREADSHEET: {
+        #--- Scale options: uncomment 1 of the 3 ---
+        # a) 'Reduce / enlarge printout': 'Scaling factor'
+        "PageScale": 100,
+        # b) 'Fit print range(s) to width / height': 'Width in pages' and 'Height in pages'
+        #"ScaleToPagesX": 1, "ScaleToPagesY": 1000,
+        # c) 'Fit print range(s) on number of pages': 'Fit print range(s) on number of pages'
+        #"ScaleToPages": 1,
+        "PrintGrid": False
+    }
+}
+#-------------------#
+# Configuration End #
+#-------------------#
+class DocumentConversionException(Exception):  # raised by DocumentConverter for connection, format and family errors
+    def __init__(self, message):
+        self.message = message  # human-readable reason; returned verbatim by __str__
+    def __str__(self):
+        return self.message
+class DocumentConverter:
+    def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
+        localContext = uno.getComponentContext()
+        resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext)
+        try:
+            context = resolver.resolve("uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
+        except NoConnectException:
+            raise DocumentConversionException, "failed to connect to OpenOffice.org on port %s" % port
+        self.desktop = context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context)
+    def convert(self, inputFile, outputFile):
+        inputUrl = self._toFileUrl(inputFile)
+        outputUrl = self._toFileUrl(outputFile)
+        loadProperties = { "Hidden": True }
+        inputExt = self._getFileExt(inputFile)
+        if IMPORT_FILTER_MAP.has_key(inputExt):
+            loadProperties.update(IMPORT_FILTER_MAP[inputExt])
+        document = self.desktop.loadComponentFromURL(inputUrl, "_blank", 0, self._toProperties(loadProperties))
+        try:
+            document.refresh()
+        except AttributeError:
+            pass
+        family = self._detectFamily(document)
+        self._overridePageStyleProperties(document, family)
+        outputExt = self._getFileExt(outputFile)
+        storeProperties = self._getStoreProperties(document, outputExt)
+        try:
+            document.storeToURL(outputUrl, self._toProperties(storeProperties))
+        finally:
+            document.close(True)
+    def _overridePageStyleProperties(self, document, family):
+        if PAGE_STYLE_OVERRIDE_PROPERTIES.has_key(family):
+            properties = PAGE_STYLE_OVERRIDE_PROPERTIES[family]
+            pageStyles = document.getStyleFamilies().getByName('PageStyles')
+            for styleName in pageStyles.getElementNames():
+                pageStyle = pageStyles.getByName(styleName)
+                for name, value in properties.items():
+                    pageStyle.setPropertyValue(name, value)
+    def _getStoreProperties(self, document, outputExt):
+        family = self._detectFamily(document)
+        try:
+            propertiesByFamily = EXPORT_FILTER_MAP[outputExt]
+        except KeyError:
+            raise DocumentConversionException, "unknown output format: '%s'" % outputExt
+        try:
+            return propertiesByFamily[family]
+        except KeyError:
+            raise DocumentConversionException, "unsupported conversion: from '%s' to '%s'" % (family, outputExt)
+    def _detectFamily(self, document):
+        if document.supportsService("com.sun.star.text.WebDocument"):
+            return FAMILY_WEB
+        if document.supportsService("com.sun.star.text.GenericTextDocument"):
+            # must be TextDocument or GlobalDocument
+            return FAMILY_TEXT
+        if document.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
+            return FAMILY_SPREADSHEET
+        if document.supportsService("com.sun.star.presentation.PresentationDocument"):
+            return FAMILY_PRESENTATION
+        if document.supportsService("com.sun.star.drawing.DrawingDocument"):
+            return FAMILY_DRAWING
+        raise DocumentConversionException, "unknown document family: %s" % document
+    def _getFileExt(self, path):
+        ext = splitext(path)[1]
+        if ext is not None:
+            return ext[1:].lower()
+    def _toFileUrl(self, path):
+        return uno.systemPathToFileUrl(abspath(path))
+    def _toProperties(self, dict):
+        props = []
+        for key in dict:
+            prop = PropertyValue()
+            prop.Name = key
+            prop.Value = dict[key]
+            props.append(prop)
+        return tuple(props)
+if __name__ == "__main__":  # command-line entry point: python DocumentConverter.py <input-file> <output-file>
+    from sys import argv, exit
+    if len(argv) < 3:
+        print "USAGE: python %s <input-file> <output-file>" % argv[0]
+        exit(255)  # usage error
+    if not isfile(argv[1]):
+        print "no such input file: %s" % argv[1]
+        exit(1)
+    try:
+        converter = DocumentConverter()  # connects on DEFAULT_OPENOFFICE_PORT
+        converter.convert(argv[1], argv[2])
+    except DocumentConversionException, exception:
+        print "ERROR! " + str(exception)
+        exit(1)
+    except ErrorCodeIOException, exception:
+        print "ERROR! ErrorCodeIOException %d" % exception.ErrCode
+        exit(1)

data/lib/docpdftotext.rb ADDED

@@ -0,0 +1,82 @@
+require 'tempfile'
+module DocPdfToText
+  VERSION = "1.0.0"
+  ANTIWORD_PATH  = "antiword"
+  ODF_CONVERTER_PATH = "OdfConverter"
+  PYTHON_PATH = "python"
+  DOC_CONVERTER_PATH =  File.join(File.dirname(__FILE__), "DocumentConverter.py")
+  PDFTOTEXT_PATH = "pdftotext"
+  def file_to_txt(file_path)
+    expanded_path = File.expand_path(file_path)
+    raise ArgumentError, "Unknown file" unless File.exists?(expanded_path)
+    return case File.extname(expanded_path)
+    when ".docx"
+      docx_to_txt(file_path)
+    when ".doc"
+      doc_to_txt(file_path)
+    when ".pdf"
+      pdf_to_txt(file_path)
+    when ".txt"
+      read_txt_file(file_path)
+    else
+      raise ArgumentError, "Invalid file type"
+    end
+  end
+  def docx_to_txt(file_path)
+    expanded_path = File.expand_path(file_path)
+    raise ArgumentError, "Unknown file" unless File.exists?(expanded_path)
+    raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".docx"
+    tmp_odt = Tempfile.new("docx")
+    tmp_odt_path = tmp_odt.path + ".odt"
+    tmp_odt.close # so our script can write to it; it isn't deleted till gc
+    cmd = "#{ODF_CONVERTER_PATH} /LEVEL 4 /I #{expanded_path} /O #{tmp_odt_path}"
+    `#{cmd}`
+    tmp_final = Tempfile.new("txt")
+    tmp_final_path = tmp_final.path + ".txt"
+    tmp_final.close
+    cmd = "#{PYTHON_PATH} #{DOC_CONVERTER_PATH} #{tmp_odt_path} #{tmp_final_path}"
+    `#{cmd}`
+    return read_txt_file(tmp_final_path)
+  end
+  def read_txt_file(file_path)
+    expanded_path = File.expand_path(file_path)
+    raise ArgumentError, "Unknown file" unless File.exists?(expanded_path)
+    final = []
+    File.open(expanded_path, "r") do |infile|
+      final.push(infile.gets)
+    end
+    return final.join("\n")
+  end
+  def doc_to_txt(file_path)
+    expanded_path = File.expand_path(file_path)
+    raise ArgumentError, "Unknown file" unless File.exists?(expanded_path)
+    raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".doc"
+    cmd = "#{ANTIWORD_PATH} #{expanded_path}"
+    return `#{cmd}`
+  end
+  def pdf_to_txt(file_path)
+    expanded_path = File.expand_path(file_path)
+    raise ArgumentError, "Unknown file" unless File.exists?(expanded_path)
+    raise ArgumentError, "Invalid file type" unless File.extname(expanded_path) == ".pdf"
+    tmp = Tempfile.new("pdf")
+    tmp_path = tmp.path
+    tmp.close # so our script can write to it; it isn't deleted till gc
+    cmd = "#{PDFTOTEXT_PATH} #{expanded_path} #{tmp_path}"
+    `#{cmd}`
+    return read_txt_file(tmp_path)
+  end
+end

data/test/docpdftotext_test.rb CHANGED

@@ -1,9 +1,13 @@
 require 'test_helper'
-require 'tempfile'
 class DocPdfToTextTest < Test::Unit::TestCase
   include DocPdfToText
+  should "Convert a pdf file" do
+    test_file = File.join(File.dirname(__FILE__), "test.pdf")
+    assert(file_to_txt(test_file).length > 0)
+  end
   should "Convert a docx file" do
     test_file = File.join(File.dirname(__FILE__), "test.docx")
     assert(file_to_txt(test_file).length > 0)
@@ -14,11 +18,6 @@ class DocPdfToTextTest < Test::Unit::TestCase
     assert(file_to_txt(test_file).length > 0)
   end
-  should "Convert a pdf file" do
-    test_file = File.join(File.dirname(__FILE__), "test.pdf")
-    assert(file_to_txt(test_file).length > 0)
-  end
   should "raise invalid file format" do
     assert_raise ArgumentError do
       test_file = File.join(File.dirname(__FILE__), "test.pdf")

data/test/test.doc ADDED

Binary file

data/test/test.docx ADDED

Binary file

data/test/test.pdf ADDED

Binary file

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: docpdftotext
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - esilverberg
@@ -22,16 +22,6 @@ dependencies:
       - !ruby/object:Gem::Version
         version: "0"
     version:
-- !ruby/object:Gem::Dependency
-  name: pdf-reader
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: "0"
-    version:
 description: wrappers for libraries to convert documents into text
 email: eric@ericsilverberg.com
 executables: []
@@ -42,8 +32,20 @@ extra_rdoc_files:
 - LICENSE
 - README.rdoc
 files:
+- .document
+- .gitignore
 - LICENSE
 - README.rdoc
+- Rakefile
+- VERSION
+- docpdftotext.gemspec
+- lib/DocumentConverter.py
+- lib/docpdftotext.rb
+- test/docpdftotext_test.rb
+- test/test.doc
+- test/test.docx
+- test/test.pdf
+- test/test_helper.rb
 has_rdoc: true
 homepage: http://github.com/esilverberg/docpdftotext
 licenses: []
@@ -73,5 +75,5 @@ signing_key:
 specification_version: 3
 summary: Convert word to text in ruby
 test_files:
-- test/test_helper.rb
 - test/docpdftotext_test.rb
+- test/test_helper.rb