gdor-indexer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5e1bc32ca2df58b2018dbc9f27aa2ce897cde0e4
4
+ data.tar.gz: c9a3378b959896f6007a7bdbb187069102a7897c
5
+ SHA512:
6
+ metadata.gz: c401ef958ff8bd02ce867cc066cbf9f40d6127bbc4d4916a79e08d158d694ea2ad4c27e633e2a0f27b8dccf5c623259b4f23df779692d0551f58979040ff0cf6
7
+ data.tar.gz: f0a797d7d3df7bfbdd4daa75d19a215682eb08ca885c437341a7c02b02e6bfca0b35fdc165d9116cbfb0f9b6eec1cd89de84501a724d1c5b53ca0d4969bc98c7
data/.gitignore ADDED
@@ -0,0 +1,31 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ .travis
7
+ .rvmrc
8
+ .ruby-version
9
+ Gemfile.lock
10
+ InstalledFiles
11
+ _yardoc
12
+ coverage
13
+ doc/
14
+ solrmarc-sw
15
+ solrmarc.log*
16
+ lib/bundler/man
17
+ pkg
18
+ rdoc
19
+ spec/reports
20
+ spec/test_logs
21
+ test/tmp
22
+ test/version_tmp
23
+ tmp
24
+ logs
25
+ .DS_Store
26
+ *.tmproj
27
+ tmtags
28
+ .idea/*
29
+ config/solr.yml
30
+ config/dor-fetcher-client.yml
31
+ config/collections
data/.hound.yml ADDED
@@ -0,0 +1,2 @@
1
+ ruby:
2
+ config_file: .rubocop.yml
data/.rubocop.yml ADDED
@@ -0,0 +1,3 @@
1
+ require: rubocop-rspec
2
+
3
+ inherit_from: .rubocop_todo.yml
data/.rubocop_todo.yml ADDED
@@ -0,0 +1,131 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2015-10-26 16:04:23 -0700 using RuboCop version 0.34.2.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 6
10
+ Lint/AmbiguousRegexpLiteral:
11
+ Exclude:
12
+ - 'spec/unit/indexer_spec.rb'
13
+
14
+ # Offense count: 1
15
+ # Cop supports --auto-correct.
16
+ # Configuration parameters: AlignWith, SupportedStyles, AutoCorrect.
17
+ Lint/EndAlignment:
18
+ Enabled: false
19
+
20
+ # Offense count: 6
21
+ Lint/UselessAssignment:
22
+ Exclude:
23
+ - 'lib/gdor/indexer.rb'
24
+ - 'spec/unit/gdor_mods_fields_spec.rb'
25
+ - 'spec/unit/indexer_spec.rb'
26
+ - 'spec/unit/solr_doc_builder_spec.rb'
27
+
28
+ # Offense count: 1
29
+ Lint/Void:
30
+ Exclude:
31
+ - 'spec/unit/gdor_mods_fields_spec.rb'
32
+
33
+ # Offense count: 16
34
+ Metrics/AbcSize:
35
+ Max: 82
36
+
37
+ # Offense count: 1
38
+ # Configuration parameters: CountComments.
39
+ Metrics/ClassLength:
40
+ Max: 233
41
+
42
+ # Offense count: 4
43
+ Metrics/CyclomaticComplexity:
44
+ Max: 9
45
+
46
+ # Offense count: 304
47
+ # Configuration parameters: AllowURI, URISchemes.
48
+ Metrics/LineLength:
49
+ Max: 258
50
+
51
+ # Offense count: 15
52
+ # Configuration parameters: CountComments.
53
+ Metrics/MethodLength:
54
+ Max: 45
55
+
56
+ # Offense count: 3
57
+ Metrics/PerceivedComplexity:
58
+ Max: 10
59
+
60
+ # Offense count: 5
61
+ # Configuration parameters: CustomTransform.
62
+ RSpec/FilePath:
63
+ Exclude:
64
+ - 'spec/unit/gdor_mods_fields_spec.rb'
65
+ - 'spec/unit/indexer_spec.rb'
66
+ - 'spec/unit/public_xml_fields_spec.rb'
67
+ - 'spec/unit/solr_doc_builder_spec.rb'
68
+ - 'spec/unit/solr_doc_hash_spec.rb'
69
+
70
+ # Offense count: 356
71
+ RSpec/InstanceVariable:
72
+ Exclude:
73
+ - 'spec/unit/gdor_mods_fields_spec.rb'
74
+ - 'spec/unit/indexer_spec.rb'
75
+ - 'spec/unit/public_xml_fields_spec.rb'
76
+ - 'spec/unit/solr_doc_builder_spec.rb'
77
+
78
+ # Offense count: 7
79
+ # Cop supports --auto-correct.
80
+ # Configuration parameters: EnforcedStyle, SupportedStyles.
81
+ Style/BracesAroundHashParameters:
82
+ Exclude:
83
+ - 'spec/unit/indexer_spec.rb'
84
+
85
+ # Offense count: 4
86
+ # Configuration parameters: EnforcedStyle, SupportedStyles.
87
+ Style/ClassAndModuleChildren:
88
+ Exclude:
89
+ - 'lib/gdor/indexer/mods_fields.rb'
90
+ - 'lib/gdor/indexer/public_xml_fields.rb'
91
+ - 'lib/gdor/indexer/solr_doc_builder.rb'
92
+ - 'lib/gdor/indexer/solr_doc_hash.rb'
93
+
94
+ # Offense count: 4
95
+ # Configuration parameters: Exclude.
96
+ Style/Documentation:
97
+ Exclude:
98
+ - 'lib/gdor/indexer.rb'
99
+ - 'lib/gdor/indexer/nokogiri_xml_node_mixin.rb'
100
+ - 'lib/gdor/indexer/solr_doc_hash.rb'
101
+ - 'lib/gdor/indexer/version.rb'
102
+
103
+ # Offense count: 1
104
+ Style/DoubleNegation:
105
+ Exclude:
106
+ - 'lib/gdor/indexer/solr_doc_hash.rb'
107
+
108
+ # Offense count: 3
109
+ # Configuration parameters: MinBodyLength.
110
+ Style/GuardClause:
111
+ Exclude:
112
+ - 'lib/gdor/indexer.rb'
113
+
114
+ # Offense count: 1
115
+ # Configuration parameters: NamePrefix, NamePrefixBlacklist.
116
+ Style/PredicateName:
117
+ Exclude:
118
+ - 'lib/gdor/indexer/mods_fields.rb'
119
+
120
+ # Offense count: 2
121
+ # Cop supports --auto-correct.
122
+ # Configuration parameters: EnforcedStyle, SupportedStyles, AllowInnerSlashes.
123
+ Style/RegexpLiteral:
124
+ Exclude:
125
+ - 'spec/unit/gdor_mods_fields_spec.rb'
126
+ - 'spec/unit/indexer_spec.rb'
127
+
128
+ # Offense count: 1
129
+ Style/UnlessElse:
130
+ Exclude:
131
+ - 'lib/gdor/indexer.rb'
data/.yardopts ADDED
@@ -0,0 +1,3 @@
1
+ --title 'Gryphondor Indexer Documentation'
2
+ lib/**/*.rb -
3
+ README.md
data/Capfile ADDED
@@ -0,0 +1,26 @@
1
+ # Load DSL and set up stages
2
+ require 'capistrano/setup'
3
+
4
+ # Includes default deployment tasks
5
+ require 'capistrano/deploy'
6
+
7
+ # Includes tasks from other gems included in your Gemfile
8
+ #
9
+ # For documentation on these, see for example:
10
+ #
11
+ # https://github.com/capistrano/rvm
12
+ # https://github.com/capistrano/rbenv
13
+ # https://github.com/capistrano/chruby
14
+ # https://github.com/capistrano/bundler
15
+ # https://github.com/capistrano/rails
16
+ #
17
+ # require 'capistrano/rbenv'
18
+ # require 'capistrano/chruby'
19
+ require 'capistrano/bundler'
20
+ # require 'capistrano/rails'
21
+ require 'capistrano/rvm' # gdor-indexer needs jruby until merge-manager
22
+
23
+ require 'dlss/capistrano'
24
+
25
+ # Loads custom tasks from `lib/capistrano/tasks' if you have any defined.
26
+ Dir.glob('lib/capistrano/tasks/*.cap').each { |r| import r }
data/Gemfile ADDED
@@ -0,0 +1,12 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in gdor-indexer.gemspec
4
+ gemspec
5
+
6
+ group :deployment do
7
+ gem 'capistrano', '~> 3.2'
8
+ gem 'capistrano-bundler'
9
+ gem 'capistrano-rvm' # gdor-indexer used to need jruby for merged records
10
+ gem 'lyberteam-capistrano-devel'
11
+ gem 'rainbow' # for color output
12
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,5 @@
1
+ Copyright (c) 2014. The Board of Trustees of the Leland Stanford Junior University. All rights reserved.
2
+
3
+ Redistribution and use of this distribution in source and binary forms, with or without modification, are permitted provided that: The above copyright notice and this permission notice appear in all copies and supporting documentation; The name, identifiers, and trademarks of The Board of Trustees of the Leland Stanford Junior University are not used in advertising or publicity without the express prior written permission of The Board of Trustees of the Leland Stanford Junior University; Recipients acknowledge that this distribution is made available as a research courtesy, "as is", potentially with defects, without any obligation on the part of The Board of Trustees of the Leland Stanford Junior University to provide support, services, or repair;
4
+
5
+ THE BOARD OF TRUSTEES OF THE LELAND STANFORD JUNIOR UNIVERSITY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, WITH REGARD TO THIS SOFTWARE, INCLUDING WITHOUT LIMITATION ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, AND IN NO EVENT SHALL THE BOARD OF TRUSTEES OF THE LELAND STANFORD JUNIOR UNIVERSITY BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, TORT (INCLUDING NEGLIGENCE) OR STRICT LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,67 @@
1
+ [![Dependency Status](https://gemnasium.com/sul-dlss/gdor-indexer.svg)](https://gemnasium.com/sul-dlss/gdor-indexer) [![Gem Version](https://badge.fury.io/rb/gdor-indexer.svg)](http://badge.fury.io/rb/gdor-indexer)
2
+
3
+ # gdor-indexer
4
+
5
+ Code to harvest DOR druids via the DOR Fetcher service and MODS from PURL, and to use them to index items into a Solr index, such as that for SearchWorks.
6
+
7
+ ## Prerequisites
8
+
9
+ 1. ruby
10
+ 2. bundler gem must be installed
11
+
12
+ ## Install steps for running locally
13
+
14
+ Add this line to your application's Gemfile:
15
+
16
+ gem 'gdor-indexer'
17
+
18
+ Then execute:
19
+
20
+ $ bundle
21
+
22
+ ## Configuration
23
+
24
+ #### Create a collections folder in the config directory:
25
+
26
+ $ cd /path/to/gdor-indexer/config
27
+ $ mkdir collections
28
+
29
+ #### Create a yml config file for your collection(s) to be harvested and indexed.
30
+
31
+ See ```spec/config/walters_integration_spec.yml``` for an example. Copy that file to ```config/collections``` and change the following settings:
32
+
33
+ * whitelist
34
+ * dor_fetcher service_url
35
+ * harvestdor log_dir and log_name
36
+ * solr_url
37
+
38
+ ##### whitelist
39
+
40
+ The whitelist is how you specify which objects to index. The whitelist can be
41
+
42
+ * an Array of druids inline in the config yml file
43
+ * a filename containing a list of druids (one per line)
44
+
45
+ If a druid, per the object's identityMetadata at purl page, is for a
46
+
47
+ * collection record: then we process all the item druids in that collection (as if they were included individually in the whitelist)
48
+ * non-collection record: then we process the druid as an individual item
49
+
50
+ #### Run the indexer script
51
+
52
+ $ cd /path/to/gdor-indexer
53
+ $ nohup ./bin/indexer -c my_collection &>path/to/nohup.output
54
+
55
+ ## Running the tests
56
+
57
+ ```$ rake```
58
+
59
+ ## Contributing
60
+
61
+ * Fork it (https://help.github.com/articles/fork-a-repo/)
62
+ * Create your feature branch (`git checkout -b my-new-feature`)
63
+ * Write code and tests.
64
+ * Commit your changes (`git commit -am 'Added some feature'`)
65
+ * Push to the branch (`git push origin my-new-feature`)
66
+ * Create new Pull Request (https://help.github.com/articles/creating-a-pull-request/)
67
+
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
+ require 'rake'
2
+ require 'bundler'
3
+ require 'bundler/gem_tasks'
4
+
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts 'Run `bundle install` to install missing gems'
10
+ exit e.status_code
11
+ end
12
+
13
+ # add tasks defined in lib/tasks
14
+ # Dir.glob('lib/tasks/*.rake').each { |r| import r }
15
+
16
+ # desc 'Open an irb session preloaded with this library'
17
+ # task :console do
18
+ # sh 'irb -rubygems -I lib -r ./frda_indexer.rb'
19
+ # end
20
+
21
+ task default: [:ci, :rubocop]
22
+
23
+ desc 'run continuous integration suite (tests, coverage, docs)'
24
+ task ci: [:rspec, :doc, :rubocop]
25
+
26
+ task spec: :rspec
27
+
28
+ require 'rspec/core/rake_task'
29
+ RSpec::Core::RakeTask.new(:rspec) do |spec|
30
+ spec.rspec_opts = ['-c', '-f progress', '--tty', '-r ./spec/spec_helper.rb']
31
+ end
32
+
33
+ RSpec::Core::RakeTask.new(:rspec_wip) do |spec|
34
+ spec.rspec_opts = ['-c', '-f d', '--tty', '-r ./spec/spec_helper.rb', '-t wip']
35
+ end
36
+
37
+ require 'rubocop/rake_task'
38
+ RuboCop::RakeTask.new(:rubocop)
39
+
40
+ # Use yard to build docs
41
+ require 'yard'
42
+ require 'yard/rake/yardoc_task'
43
+ begin
44
+ project_root = File.expand_path(File.dirname(__FILE__))
45
+ doc_dest_dir = File.join(project_root, 'doc')
46
+
47
+ YARD::Rake::YardocTask.new(:doc) do |yt|
48
+ yt.files = Dir.glob(File.join(project_root, 'lib', '**', '*.rb')) +
49
+ [File.join(project_root, 'README.rdoc')]
50
+ yt.options = ['--output-dir', doc_dest_dir, '--readme', 'README.rdoc', '--title', 'Gryphondor Indexer Documentation']
51
+ end
52
+ rescue LoadError
53
+ desc 'Generate YARD Documentation'
54
+ task :doc do
55
+ abort 'Please install the YARD gem to generate rdoc.'
56
+ end
57
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
data/bin/indexer ADDED
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Look in the lib directory for .rb files
4
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
5
+
6
+ require 'rubygems'
7
+ require 'bundler/setup'
8
+ require 'trollop'
9
+ require 'gdor/indexer'
10
+
11
# List the collections this script knows how to harvest: one per yml file in
# the collections config directory.
#
# @param config_dir [String] directory holding the per-collection yml files;
#   defaults to config/collections relative to this script
# @return [Array<String>] collection names (yml basenames without the
#   extension), sorted because Dir.glob order is platform-dependent
def collections(config_dir = File.join(File.dirname(__FILE__), '..', 'config', 'collections'))
  Dir.glob(File.join(config_dir, '*.yml')).map { |path| File.basename(path, '.yml') }.sort
end
18
+
19
# Use Trollop to declare the command line options and help text.
# The parsed options are stored in @opts, keyed by option name
# (:collection, :nocommit, :all).
@opts = Trollop.options do
  version 'indexer v0.2.0 (c) 2012-2014 Stanford University http://searchworks.stanford.edu'
  banner <<-EOM
The indexer script harvests items in a collection from DOR Fetcher Service,
harvests MODS from PURL pages, then
maps them to Solr document hashes appropriate for SearchWorks, and writes them to Solr
Usage:
./indexer [options]
where [options] are:
  EOM

  # Double-quoted so the list of known collections is actually interpolated
  # into the help text (a single-quoted string prints the #{...} literally).
  opt :collection,
      "Index a given collection. Possible values are #{collections.sort}",
      default: nil,
      short: 'c',
      type: String,
      multi: true
  opt :nocommit,
      "Index the collection but don't commit",
      default: nil,
      short: 'n'
  opt :all, 'Index all collections'
end
43
+
44
+ # Trollop::die :collection, "must be a known collection. Possible values are: #{collections.inspect} You entered #{@opts[:collection]}" unless collections.include?@opts[:collection]
45
+
46
# Run the given block and print how long it took, in seconds, to stdout.
#
# The duration is taken from the monotonic clock when the running Ruby
# provides one, so it is not skewed by wall-clock adjustments during a long
# indexing run; older Rubies without Process.clock_gettime fall back to
# Time.now.
#
# @yield the work to be timed
# @return [nil]
def time
  monotonic = Process.respond_to?(:clock_gettime) && defined?(Process::CLOCK_MONOTONIC)
  now = monotonic ? -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) } : -> { Time.now.to_f }

  start = now.call
  yield
  elapsed = now.call - start
  puts "This set took #{elapsed} seconds to run."
end
52
+
53
# Work out which collections to index: every known collection when --all is
# given, otherwise whatever was passed via -c/--collection.
# Array() normalizes nil (no -c given), a single String, or an Array, so a
# missing option no longer turns into [nil] and a bogus ".yml" config path.
@collections = @opts[:all] ? collections : Array(@opts[:collection]).compact
abort 'No collection specified: use -c COLLECTION (repeatable) or --all' if @collections.empty?

# process each collection from the command line
@collections.each do |coll|
  # each collection is configured by config/collections/<name>.yml
  config_yml_path = File.join(File.dirname(__FILE__), '..', 'config', 'collections', "#{coll}.yml")
  @indexer = GDor::Indexer.new(config_yml_path)
  puts "Indexing into Solr server #{@indexer.config[:solr][:url]}"
  puts "Logging output to #{@indexer.config[:harvestdor][:log_dir]}/#{@indexer.config[:harvestdor][:log_name]}"

  time do
    @indexer.harvest_and_index
  end
end
data/config/deploy.rb ADDED
@@ -0,0 +1,31 @@
1
+ set :application, 'gdor-indexer'
2
+ set :repo_url, 'https://github.com/sul-dlss/gdor-indexer.git'
3
+
4
+ # Default branch is :master
5
+ # ask :branch, proc { `git rev-parse --abbrev-ref HEAD`.chomp }
6
+
7
+ # gdor-indexer needs jruby until merge-manager
8
+ set :rvm_ruby_version, 'jruby-1.7.10'
9
+
10
+ set :user, 'lyberadmin'
11
+ set :deploy_to, "/home/#{fetch(:user)}/#{fetch(:application)}"
12
+
13
+ set :linked_dirs, %w(logs config/collections tmp solrmarc-sw)
14
+ set :linked_files, %w(.ruby-version config/solr.yml bin/index-prod-image.sh bin/index-prod-hydrus.sh config/dor-fetcher-client.yml)
15
+
16
+ set :stages, %w(dev stage prod fetcher)
17
+
18
+ # Default value for :log_level is :debug
19
+ set :log_level, :info
20
+
21
+ # Default value for :format is :pretty
22
+ # set :format, :pretty
23
+
24
+ # Default value for :pty is false
25
+ # set :pty, true
26
+
27
+ # Default value for default_env is {}
28
+ # set :default_env, { path: "/opt/ruby/bin:$PATH" }
29
+
30
+ # Default value for keep_releases is 5
31
+ set :keep_releases, 10