embulk-filter-crawler 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4cb84672dc6d8f59c550c3faca25b051abe23209
4
+ data.tar.gz: 35fa411ca6985bf8815ef79f40c32865cc11b109
5
+ SHA512:
6
+ metadata.gz: a165c5351f4d4d929ec98f3b4d539cba692fbf6ed8173994a396381d58a02754ba9f36d9e36625ad4e0f2f55bfc31c1e547549fca195e7e726ede28cd7d8758e
7
+ data.tar.gz: c53e77f4f465f9da0d733a3b0db8b955e3b3c0c6ccf371a34f7487a4a98cf7052987f35974ada23824a1f72a31f1eebaa4351d9f32b54f9e4502e699e7625fba
data/.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ *.gemspec
5
+ .gradle/
6
+ /classpath/
7
+ build/
8
+ .idea
9
+ /.settings/
10
+ /.metadata/
11
+ .classpath
12
+ .project
13
+ /bin/
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,49 @@
1
+ # Crawler filter plugin for Embulk
2
+
3
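+ Crawls web sites with crawler4j, using the URLs found in an input column as seeds, and emits one record per crawled page (url, domain, title, text, html, ...).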
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: filter
8
+
9
+ ## Configuration
10
+
11
+ - **target_key**: name of the input column that holds the seed (base) URLs (string, required)
12
+ - **max_depth_of_crawling**: max depth of crawling (integer, default: unlimited)
13
+ - **seed_size**: seed_size (string, default: `"myvalue"`)
14
+ - **number_of_crawlers**: number of concurrent crawlers, i.e. the parallelism (integer, default: `1`)
15
+ - **max_pages_to_fetch**: maximum number of pages to fetch (integer, default: `-1`, i.e. unlimited)
16
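+ - **crawl_storage_folder**: folder for the crawler's working data; the value is passed through `String.format` with a random UUID, so a `%s` placeholder is replaced by that UUID (string, required)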
17
+ - **politeness_delay**: politeness_delay (integer, default: null)
18
+ - **user_agent_string**: user_agent_string (string, default: null)
19
+ - **keep_input**: keep_input (boolean, default: `true`)
20
+
21
+ ## Example
22
+
23
+ ```yaml
24
+ in:
25
+ type: mysql
26
+ host: dbs04
27
+ user: application
28
+ password: XXXXXXXX
29
+ database: iap
30
+ query: |
31
+ select url from companies limit 100
32
+ filters:
33
+ - type: crawler
34
+ target_key: url
35
+ number_of_crawlers: 10
36
+ seed_size: 100
37
+ max_depth_of_crawling: 4
38
+ politeness_delay: 100
39
+ crawl_storage_folder: "/tmp/crawl/%s"
40
+ out:
41
+ type: stdout
42
+ ```
43
+
44
+
45
+ ## Build
46
+
47
+ ```
48
+ $ ./gradlew gem # -t to watch change of files and rebuild continuously
49
+ ```
data/build.gradle ADDED
@@ -0,0 +1,94 @@
1
+ plugins {
2
+ id "com.jfrog.bintray" version "1.1"
3
+ id "com.github.jruby-gradle.base" version "0.1.5"
4
+ id "java"
5
+ id "checkstyle"
6
+ id "eclipse"
7
+ }
8
+ import com.github.jrubygradle.JRubyExec
9
+ repositories {
10
+ mavenCentral()
11
+ jcenter()
12
+ }
13
+ configurations {
14
+ provided
15
+ }
16
+
17
+ version = "0.1.0"
18
+
19
+ sourceCompatibility = 1.7
20
+ targetCompatibility = 1.7
21
+
22
+ dependencies {
23
+ compile "org.embulk:embulk-core:0.8.3"
24
+ compile "edu.uci.ics:crawler4j:4.2"
25
+ provided "org.embulk:embulk-core:0.8.3"
26
+ testCompile "junit:junit:4.+"
27
+ }
28
+
29
+ task classpath(type: Copy, dependsOn: ["jar"]) {
30
+ doFirst { file("classpath").deleteDir() }
31
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
32
+ into "classpath"
33
+ }
34
+ clean { delete "classpath" }
35
+
36
+ checkstyle {
37
+ configFile = file("${project.rootDir}/config/checkstyle/checkstyle.xml")
38
+ toolVersion = '6.14.1'
39
+ }
40
+ checkstyleMain {
41
+ configFile = file("${project.rootDir}/config/checkstyle/default.xml")
42
+ ignoreFailures = true
43
+ }
44
+ checkstyleTest {
45
+ configFile = file("${project.rootDir}/config/checkstyle/default.xml")
46
+ ignoreFailures = true
47
+ }
48
+ task checkstyle(type: Checkstyle) {
49
+ classpath = sourceSets.main.output + sourceSets.test.output
50
+ source = sourceSets.main.allJava + sourceSets.test.allJava
51
+ }
52
+
53
+ task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
54
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
55
+ script "${project.name}.gemspec"
56
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
57
+ }
58
+
59
+ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
60
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
61
+ script "pkg/${project.name}-${project.version}.gem"
62
+ }
63
+
64
+ task "package"(dependsOn: ["gemspec", "classpath"]) << {
65
+ println "> Build succeeded."
66
+ println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
67
+ }
68
+
69
+ task gemspec {
70
+ ext.gemspecFile = file("${project.name}.gemspec")
71
+ inputs.file "build.gradle"
72
+ outputs.file gemspecFile
73
+ doLast { gemspecFile.write($/
74
+ Gem::Specification.new do |spec|
75
+ spec.name = "${project.name}"
76
+ spec.version = "${project.version}"
77
+ spec.authors = ["toyama0919"]
78
+ spec.summary = %[Crawler4J filter plugin for Embulk]
79
+ spec.description = %[Crawler4J filter plugin for Embulk]
80
+ spec.email = ["toyama0919@gmail.com"]
81
+ spec.licenses = ["MIT"]
82
+ spec.homepage = "https://github.com/toyama0919/embulk-filter-crawler"
83
+
84
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
85
+ spec.test_files = spec.files.grep(%r"^(test|spec)/")
86
+ spec.require_paths = ["lib"]
87
+
88
+ spec.add_development_dependency 'bundler', ['~> 1.0']
89
+ spec.add_development_dependency 'rake', ['>= 10.0']
90
+ end
91
+ /$)
92
+ }
93
+ }
94
+ clean { delete "${project.name}.gemspec" }
@@ -0,0 +1,128 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE module PUBLIC
3
+ "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
4
+ "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
5
+ <module name="Checker">
6
+ <!-- https://github.com/facebook/presto/blob/master/src/checkstyle/checks.xml -->
7
+ <module name="FileTabCharacter"/>
8
+ <module name="NewlineAtEndOfFile">
9
+ <property name="lineSeparator" value="lf"/>
10
+ </module>
11
+ <module name="RegexpMultiline">
12
+ <property name="format" value="\r"/>
13
+ <property name="message" value="Line contains carriage return"/>
14
+ </module>
15
+ <module name="RegexpMultiline">
16
+ <property name="format" value=" \n"/>
17
+ <property name="message" value="Line has trailing whitespace"/>
18
+ </module>
19
+ <module name="RegexpMultiline">
20
+ <property name="format" value="\{\n\n"/>
21
+ <property name="message" value="Blank line after opening brace"/>
22
+ </module>
23
+ <module name="RegexpMultiline">
24
+ <property name="format" value="\n\n\s*\}"/>
25
+ <property name="message" value="Blank line before closing brace"/>
26
+ </module>
27
+ <module name="RegexpMultiline">
28
+ <property name="format" value="\n\n\n"/>
29
+ <property name="message" value="Multiple consecutive blank lines"/>
30
+ </module>
31
+ <module name="RegexpMultiline">
32
+ <property name="format" value="\n\n\Z"/>
33
+ <property name="message" value="Blank line before end of file"/>
34
+ </module>
35
+ <module name="RegexpMultiline">
36
+ <property name="format" value="Preconditions\.checkNotNull"/>
37
+ <property name="message" value="Use of checkNotNull"/>
38
+ </module>
39
+
40
+ <module name="TreeWalker">
41
+ <module name="EmptyBlock">
42
+ <property name="option" value="text"/>
43
+ <property name="tokens" value="
44
+ LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_IF,
45
+ LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT, STATIC_INIT"/>
46
+ </module>
47
+ <module name="EmptyStatement"/>
48
+ <module name="EmptyForInitializerPad"/>
49
+ <module name="EmptyForIteratorPad">
50
+ <property name="option" value="space"/>
51
+ </module>
52
+ <module name="MethodParamPad">
53
+ <property name="allowLineBreaks" value="true"/>
54
+ <property name="option" value="nospace"/>
55
+ </module>
56
+ <module name="ParenPad"/>
57
+ <module name="TypecastParenPad"/>
58
+ <module name="NeedBraces"/>
59
+ <module name="LeftCurly">
60
+ <property name="option" value="nl"/>
61
+ <property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
62
+ </module>
63
+ <module name="LeftCurly">
64
+ <property name="option" value="eol"/>
65
+ <property name="tokens" value="
66
+ LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
67
+ LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
68
+ </module>
69
+ <module name="RightCurly">
70
+ <property name="option" value="alone"/>
71
+ </module>
72
+ <module name="GenericWhitespace"/>
73
+ <module name="WhitespaceAfter"/>
74
+ <module name="NoWhitespaceBefore"/>
75
+
76
+ <module name="UpperEll"/>
77
+ <module name="DefaultComesLast"/>
78
+ <module name="ArrayTypeStyle"/>
79
+ <module name="MultipleVariableDeclarations"/>
80
+ <module name="ModifierOrder"/>
81
+ <module name="OneStatementPerLine"/>
82
+ <module name="StringLiteralEquality"/>
83
+ <module name="MutableException"/>
84
+ <module name="EqualsHashCode"/>
85
+ <module name="InnerAssignment"/>
86
+ <module name="InterfaceIsType"/>
87
+ <module name="HideUtilityClassConstructor"/>
88
+
89
+ <module name="MemberName"/>
90
+ <module name="LocalVariableName"/>
91
+ <module name="LocalFinalVariableName"/>
92
+ <module name="TypeName"/>
93
+ <module name="PackageName"/>
94
+ <module name="ParameterName"/>
95
+ <module name="StaticVariableName"/>
96
+ <module name="ClassTypeParameterName">
97
+ <property name="format" value="^[A-Z][0-9]?$"/>
98
+ </module>
99
+ <module name="MethodTypeParameterName">
100
+ <property name="format" value="^[A-Z][0-9]?$"/>
101
+ </module>
102
+
103
+ <module name="AvoidStarImport"/>
104
+ <module name="RedundantImport"/>
105
+ <module name="UnusedImports"/>
106
+ <module name="ImportOrder">
107
+ <property name="groups" value="*,javax,java"/>
108
+ <property name="separated" value="true"/>
109
+ <property name="option" value="bottom"/>
110
+ <property name="sortStaticImportsAlphabetically" value="true"/>
111
+ </module>
112
+
113
+ <module name="WhitespaceAround">
114
+ <property name="allowEmptyConstructors" value="true"/>
115
+ <property name="allowEmptyMethods" value="true"/>
116
+ <property name="ignoreEnhancedForColon" value="false"/>
117
+ <property name="tokens" value="
118
+ ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN,
119
+ BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE,
120
+ LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE,
121
+ LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN,
122
+ LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE,
123
+ LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL,
124
+ PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN,
125
+ STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
126
+ </module>
127
+ </module>
128
+ </module>
@@ -0,0 +1,108 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE module PUBLIC
3
+ "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
4
+ "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
5
+ <!--
6
+ This is a subset of ./checkstyle.xml which allows some loose styles
7
+ -->
8
+ <module name="Checker">
9
+ <module name="FileTabCharacter"/>
10
+ <module name="NewlineAtEndOfFile">
11
+ <property name="lineSeparator" value="lf"/>
12
+ </module>
13
+ <module name="RegexpMultiline">
14
+ <property name="format" value="\r"/>
15
+ <property name="message" value="Line contains carriage return"/>
16
+ </module>
17
+ <module name="RegexpMultiline">
18
+ <property name="format" value=" \n"/>
19
+ <property name="message" value="Line has trailing whitespace"/>
20
+ </module>
21
+ <module name="RegexpMultiline">
22
+ <property name="format" value="\n\n\n"/>
23
+ <property name="message" value="Multiple consecutive blank lines"/>
24
+ </module>
25
+ <module name="RegexpMultiline">
26
+ <property name="format" value="\n\n\Z"/>
27
+ <property name="message" value="Blank line before end of file"/>
28
+ </module>
29
+
30
+ <module name="TreeWalker">
31
+ <module name="EmptyBlock">
32
+ <property name="option" value="text"/>
33
+ <property name="tokens" value="
34
+ LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_IF,
35
+ LITERAL_FOR, LITERAL_TRY, LITERAL_WHILE, INSTANCE_INIT, STATIC_INIT"/>
36
+ </module>
37
+ <module name="EmptyStatement"/>
38
+ <module name="EmptyForInitializerPad"/>
39
+ <module name="EmptyForIteratorPad">
40
+ <property name="option" value="space"/>
41
+ </module>
42
+ <module name="MethodParamPad">
43
+ <property name="allowLineBreaks" value="true"/>
44
+ <property name="option" value="nospace"/>
45
+ </module>
46
+ <module name="ParenPad"/>
47
+ <module name="TypecastParenPad"/>
48
+ <module name="NeedBraces"/>
49
+ <module name="LeftCurly">
50
+ <property name="option" value="nl"/>
51
+ <property name="tokens" value="CLASS_DEF, CTOR_DEF, INTERFACE_DEF, METHOD_DEF"/>
52
+ </module>
53
+ <module name="LeftCurly">
54
+ <property name="option" value="eol"/>
55
+ <property name="tokens" value="
56
+ LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE, LITERAL_FINALLY, LITERAL_FOR,
57
+ LITERAL_IF, LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE"/>
58
+ </module>
59
+ <module name="RightCurly">
60
+ <property name="option" value="alone"/>
61
+ </module>
62
+ <module name="GenericWhitespace"/>
63
+ <module name="WhitespaceAfter"/>
64
+ <module name="NoWhitespaceBefore"/>
65
+
66
+ <module name="UpperEll"/>
67
+ <module name="DefaultComesLast"/>
68
+ <module name="ArrayTypeStyle"/>
69
+ <module name="MultipleVariableDeclarations"/>
70
+ <module name="ModifierOrder"/>
71
+ <module name="OneStatementPerLine"/>
72
+ <module name="StringLiteralEquality"/>
73
+ <module name="MutableException"/>
74
+ <module name="EqualsHashCode"/>
75
+ <module name="InnerAssignment"/>
76
+ <module name="InterfaceIsType"/>
77
+ <module name="HideUtilityClassConstructor"/>
78
+
79
+ <module name="MemberName"/>
80
+ <module name="LocalVariableName"/>
81
+ <module name="LocalFinalVariableName"/>
82
+ <module name="TypeName"/>
83
+ <module name="PackageName"/>
84
+ <module name="ParameterName"/>
85
+ <module name="StaticVariableName"/>
86
+ <module name="ClassTypeParameterName">
87
+ <property name="format" value="^[A-Z][0-9]?$"/>
88
+ </module>
89
+ <module name="MethodTypeParameterName">
90
+ <property name="format" value="^[A-Z][0-9]?$"/>
91
+ </module>
92
+
93
+ <module name="WhitespaceAround">
94
+ <property name="allowEmptyConstructors" value="true"/>
95
+ <property name="allowEmptyMethods" value="true"/>
96
+ <property name="ignoreEnhancedForColon" value="false"/>
97
+ <property name="tokens" value="
98
+ ASSIGN, BAND, BAND_ASSIGN, BOR, BOR_ASSIGN, BSR, BSR_ASSIGN,
99
+ BXOR, BXOR_ASSIGN, COLON, DIV, DIV_ASSIGN, EQUAL, GE, GT, LAND, LE,
100
+ LITERAL_ASSERT, LITERAL_CATCH, LITERAL_DO, LITERAL_ELSE,
101
+ LITERAL_FINALLY, LITERAL_FOR, LITERAL_IF, LITERAL_RETURN,
102
+ LITERAL_SYNCHRONIZED, LITERAL_TRY, LITERAL_WHILE,
103
+ LOR, LT, MINUS, MINUS_ASSIGN, MOD, MOD_ASSIGN, NOT_EQUAL,
104
+ PLUS, PLUS_ASSIGN, QUESTION, SL, SLIST, SL_ASSIGN, SR, SR_ASSIGN,
105
+ STAR, STAR_ASSIGN, TYPE_EXTENSION_AND"/>
106
+ </module>
107
+ </module>
108
+ </module>
Binary file
@@ -0,0 +1,6 @@
1
+ #Wed Jan 13 12:41:02 JST 2016
2
+ distributionBase=GRADLE_USER_HOME
3
+ distributionPath=wrapper/dists
4
+ zipStoreBase=GRADLE_USER_HOME
5
+ zipStorePath=wrapper/dists
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.10-bin.zip
data/gradlew ADDED
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/env bash
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
10
+ DEFAULT_JVM_OPTS=""
11
+
12
+ APP_NAME="Gradle"
13
+ APP_BASE_NAME=`basename "$0"`
14
+
15
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
16
+ MAX_FD="maximum"
17
+
18
+ warn ( ) {
19
+ echo "$*"
20
+ }
21
+
22
+ die ( ) {
23
+ echo
24
+ echo "$*"
25
+ echo
26
+ exit 1
27
+ }
28
+
29
+ # OS specific support (must be 'true' or 'false').
30
+ cygwin=false
31
+ msys=false
32
+ darwin=false
33
+ case "`uname`" in
34
+ CYGWIN* )
35
+ cygwin=true
36
+ ;;
37
+ Darwin* )
38
+ darwin=true
39
+ ;;
40
+ MINGW* )
41
+ msys=true
42
+ ;;
43
+ esac
44
+
45
+ # Attempt to set APP_HOME
46
+ # Resolve links: $0 may be a link
47
+ PRG="$0"
48
+ # Need this for relative symlinks.
49
+ while [ -h "$PRG" ] ; do
50
+ ls=`ls -ld "$PRG"`
51
+ link=`expr "$ls" : '.*-> \(.*\)$'`
52
+ if expr "$link" : '/.*' > /dev/null; then
53
+ PRG="$link"
54
+ else
55
+ PRG=`dirname "$PRG"`"/$link"
56
+ fi
57
+ done
58
+ SAVED="`pwd`"
59
+ cd "`dirname \"$PRG\"`/" >/dev/null
60
+ APP_HOME="`pwd -P`"
61
+ cd "$SAVED" >/dev/null
62
+
63
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
64
+
65
+ # Determine the Java command to use to start the JVM.
66
+ if [ -n "$JAVA_HOME" ] ; then
67
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
68
+ # IBM's JDK on AIX uses strange locations for the executables
69
+ JAVACMD="$JAVA_HOME/jre/sh/java"
70
+ else
71
+ JAVACMD="$JAVA_HOME/bin/java"
72
+ fi
73
+ if [ ! -x "$JAVACMD" ] ; then
74
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
75
+
76
+ Please set the JAVA_HOME variable in your environment to match the
77
+ location of your Java installation."
78
+ fi
79
+ else
80
+ JAVACMD="java"
81
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
82
+
83
+ Please set the JAVA_HOME variable in your environment to match the
84
+ location of your Java installation."
85
+ fi
86
+
87
+ # Increase the maximum file descriptors if we can.
88
+ if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
89
+ MAX_FD_LIMIT=`ulimit -H -n`
90
+ if [ $? -eq 0 ] ; then
91
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
92
+ MAX_FD="$MAX_FD_LIMIT"
93
+ fi
94
+ ulimit -n $MAX_FD
95
+ if [ $? -ne 0 ] ; then
96
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
97
+ fi
98
+ else
99
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
100
+ fi
101
+ fi
102
+
103
+ # For Darwin, add options to specify how the application appears in the dock
104
+ if $darwin; then
105
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
106
+ fi
107
+
108
+ # For Cygwin, switch paths to Windows format before running java
109
+ if $cygwin ; then
110
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
111
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
112
+ JAVACMD=`cygpath --unix "$JAVACMD"`
113
+
114
+ # We build the pattern for arguments to be converted via cygpath
115
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
116
+ SEP=""
117
+ for dir in $ROOTDIRSRAW ; do
118
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
119
+ SEP="|"
120
+ done
121
+ OURCYGPATTERN="(^($ROOTDIRS))"
122
+ # Add a user-defined pattern to the cygpath arguments
123
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
124
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
125
+ fi
126
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
127
+ i=0
128
+ for arg in "$@" ; do
129
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
130
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
131
+
132
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
133
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
134
+ else
135
+ eval `echo args$i`="\"$arg\""
136
+ fi
137
+ i=$((i+1))
138
+ done
139
+ case $i in
140
+ (0) set -- ;;
141
+ (1) set -- "$args0" ;;
142
+ (2) set -- "$args0" "$args1" ;;
143
+ (3) set -- "$args0" "$args1" "$args2" ;;
144
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
145
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
146
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
147
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
148
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
149
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
150
+ esac
151
+ fi
152
+
153
+ # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
154
+ function splitJvmOpts() {
155
+ JVM_OPTS=("$@")
156
+ }
157
+ eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
158
+ JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
159
+
160
+ exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
data/gradlew.bat ADDED
@@ -0,0 +1,90 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12
+ set DEFAULT_JVM_OPTS=
13
+
14
+ set DIRNAME=%~dp0
15
+ if "%DIRNAME%" == "" set DIRNAME=.
16
+ set APP_BASE_NAME=%~n0
17
+ set APP_HOME=%DIRNAME%
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windowz variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+ if "%@eval[2+2]" == "4" goto 4NT_args
53
+
54
+ :win9xME_args
55
+ @rem Slurp the command line arguments.
56
+ set CMD_LINE_ARGS=
57
+ set _SKIP=2
58
+
59
+ :win9xME_args_slurp
60
+ if "x%~1" == "x" goto execute
61
+
62
+ set CMD_LINE_ARGS=%*
63
+ goto execute
64
+
65
+ :4NT_args
66
+ @rem Get arguments from the 4NT Shell from JP Software
67
+ set CMD_LINE_ARGS=%$
68
+
69
+ :execute
70
+ @rem Setup the command line
71
+
72
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73
+
74
+ @rem Execute Gradle
75
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76
+
77
+ :end
78
+ @rem End local scope for the variables with windows NT shell
79
+ if "%ERRORLEVEL%"=="0" goto mainEnd
80
+
81
+ :fail
82
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83
+ rem the _cmd.exe /c_ return code!
84
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85
+ exit /b 1
86
+
87
+ :mainEnd
88
+ if "%OS%"=="Windows_NT" endlocal
89
+
90
+ :omega
@@ -0,0 +1,3 @@
1
# Register the Java-implemented "crawler" filter with Embulk.
# The jars are looked up in the "classpath" directory, resolved relative to
# this file's location.
classpath_dir = File.expand_path('../../../../classpath', __FILE__)

Embulk::JavaPlugin.register_filter(
  "crawler",
  "org.embulk.filter.crawler.CrawlerFilterPlugin",
  classpath_dir
)
@@ -0,0 +1,21 @@
1
+ package org.embulk.filter.crawler;
2
+
3
/**
 * Base names of the columns produced by the crawler filter.
 *
 * <p>The plugin prepends the configured output prefix to each of these names
 * when it builds the output schema.
 */
public class Constants
{
    // Location of the crawled page.
    public static final String URL = "url";
    public static final String DOMAIN = "domain";
    public static final String SUBDOMAIN = "subdomain";
    public static final String PATH = "path";

    // How the page was reached.
    public static final String ANCHOR = "anchor";
    public static final String PARENT_URL = "parent_url";

    // Response metadata.
    public static final String CONTENT_CHARSET = "content_charset";
    public static final String REDIRECT_TO_URL = "redirect_to_url";
    public static final String LANGUAGE = "language";
    public static final String STATUS_CODE = "status_code";

    // Page content.
    public static final String TITLE = "title";
    public static final String TEXT = "text";
    public static final String HTML = "html";

    /** Not instantiable: this class only carries constants. */
    private Constants()
    {
    }
}
@@ -0,0 +1,38 @@
1
+ /**
2
+ * Licensed to the Apache Software Foundation (ASF) under one or more
3
+ * contributor license agreements. See the NOTICE file distributed with
4
+ * this work for additional information regarding copyright ownership.
5
+ * The ASF licenses this file to You under the Apache License, Version 2.0
6
+ * (the "License"); you may not use this file except in compliance with
7
+ * the License. You may obtain a copy of the License at
8
+ *
9
+ * http://www.apache.org/licenses/LICENSE-2.0
10
+ *
11
+ * Unless required by applicable law or agreed to in writing, software
12
+ * distributed under the License is distributed on an "AS IS" BASIS,
13
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ * See the License for the specific language governing permissions and
15
+ * limitations under the License.
16
+ */
17
+
18
+ package org.embulk.filter.crawler;
19
+
20
+ import java.util.List;
21
+ import java.util.Map;
22
+
23
+ import com.google.common.collect.Lists;
24
+
25
+ public class CrawlStat
26
+ {
27
+ List<Map<String, Object>> pages = Lists.newArrayList();
28
+
29
+ public List<Map<String, Object>> getPages()
30
+ {
31
+ return pages;
32
+ }
33
+
34
+ public void setPages(List<Map<String, Object>> pages)
35
+ {
36
+ this.pages = pages;
37
+ }
38
+ }
@@ -0,0 +1,221 @@
1
+ package org.embulk.filter.crawler;
2
+
3
+ import java.io.File;
4
+ import java.util.List;
5
+ import java.util.Map;
6
+ import java.util.UUID;
7
+
8
+ import org.embulk.config.Config;
9
+ import org.embulk.config.ConfigDefault;
10
+ import org.embulk.config.ConfigSource;
11
+ import org.embulk.config.Task;
12
+ import org.embulk.config.TaskSource;
13
+ import org.embulk.spi.Column;
14
+ import org.embulk.spi.Exec;
15
+ import org.embulk.spi.FilterPlugin;
16
+ import org.embulk.spi.Page;
17
+ import org.embulk.spi.PageBuilder;
18
+ import org.embulk.spi.PageOutput;
19
+ import org.embulk.spi.PageReader;
20
+ import org.embulk.spi.Schema;
21
+ import org.embulk.spi.type.Types;
22
+
23
+ import com.google.common.base.Optional;
24
+ import com.google.common.collect.ImmutableList;
25
+ import com.google.common.collect.Lists;
26
+ import com.google.common.collect.Maps;
27
+
28
+ import edu.uci.ics.crawler4j.crawler.CrawlConfig;
29
+ import edu.uci.ics.crawler4j.crawler.CrawlController;
30
+ import edu.uci.ics.crawler4j.fetcher.PageFetcher;
31
+ import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
32
+ import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
33
+
34
+ public class CrawlerFilterPlugin
35
+ implements FilterPlugin
36
+ {
37
+ public interface PluginTask
38
+ extends Task
39
+ {
40
+ @Config("max_depth_of_crawling")
41
+ @ConfigDefault("null")
42
+ public Optional<Integer> getMaxDepthOfCrawling();
43
+
44
+ @Config("number_of_crawlers")
45
+ @ConfigDefault("1")
46
+ public int getNumberOfCrawlers();
47
+
48
+ @Config("max_pages_to_fetch")
49
+ @ConfigDefault("-1")
50
+ public int getMaxPagesToFetch();
51
+
52
+ @Config("target_key")
53
+ public String getTargetKey();
54
+
55
+ @Config("crawl_storage_folder")
56
+ public String getCrawlStorageFolder();
57
+
58
+ @Config("politeness_delay")
59
+ @ConfigDefault("null")
60
+ public Optional<Integer> getPolitenessDelay();
61
+
62
+ @Config("user_agent_string")
63
+ @ConfigDefault("null")
64
+ public Optional<String> getUserAgentString();
65
+
66
+ @Config("keep_input")
67
+ @ConfigDefault("true")
68
+ public boolean getKeepInput();
69
+
70
+ @Config("output_prefix")
71
+ @ConfigDefault("\"\"")
72
+ public String getOutputPrefix();
73
+
74
+ @Config("should_not_visit_pattern")
75
+ @ConfigDefault("null")
76
+ public Optional<String> getShouldNotVisitPattern();
77
+ }
78
+
79
+ @Override
80
+ public void transaction(ConfigSource config, Schema inputSchema,
81
+ FilterPlugin.Control control)
82
+ {
83
+ PluginTask task = config.loadConfig(PluginTask.class);
84
+ ImmutableList.Builder<Column> builder = ImmutableList.builder();
85
+
86
+ int i = 0;
87
+ builder.addAll(getOutputColumns(i, task.getOutputPrefix()));
88
+
89
+ Schema outputSchema = new Schema(builder.build());
90
+ control.run(task.dump(), outputSchema);
91
+ }
92
+
93
+ /**
94
+ * @param i
95
+ * @param outputPrefix
96
+ * @return
97
+ */
98
+ private List<Column> getOutputColumns(int i, String outputPrefix) {
99
+ List<Column> list = Lists.newArrayList();
100
+ list.add(new Column(i++, outputPrefix + Constants.URL, Types.STRING));
101
+ list.add(new Column(i++, outputPrefix + Constants.DOMAIN, Types.STRING));
102
+ list.add(new Column(i++, outputPrefix + Constants.SUBDOMAIN, Types.STRING));
103
+ list.add(new Column(i++, outputPrefix + Constants.PATH, Types.STRING));
104
+ list.add(new Column(i++, outputPrefix + Constants.ANCHOR, Types.STRING));
105
+ list.add(new Column(i++, outputPrefix + Constants.PARENT_URL, Types.STRING));
106
+ list.add(new Column(i++, outputPrefix + Constants.CONTENT_CHARSET, Types.STRING));
107
+ list.add(new Column(i++, outputPrefix + Constants.REDIRECT_TO_URL, Types.STRING));
108
+ list.add(new Column(i++, outputPrefix + Constants.LANGUAGE, Types.STRING));
109
+ list.add(new Column(i++, outputPrefix + Constants.STATUS_CODE, Types.LONG));
110
+ list.add(new Column(i++, outputPrefix + Constants.TITLE, Types.STRING));
111
+ list.add(new Column(i++, outputPrefix + Constants.TEXT, Types.STRING));
112
+ list.add(new Column(i++, outputPrefix + Constants.HTML, Types.STRING));
113
+ return list;
114
+ }
115
+
116
+ @Override
117
+ public PageOutput open(TaskSource taskSource, final Schema inputSchema,
118
+ final Schema outputSchema, final PageOutput output)
119
+ {
120
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
121
+ final Column keyNameColumn = inputSchema.lookupColumn(task.getTargetKey());
122
+
123
+ return new PageOutput() {
124
+ private PageReader reader = new PageReader(inputSchema);
125
+ private PageBuilder builder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, output);
126
+ private CrawlController controller = getController();
127
+
128
+ @Override
129
+ public void finish()
130
+ {
131
+ for (Object object : controller.getCrawlersLocalData()) {
132
+ CrawlStat crawlStat = (CrawlStat) object;
133
+ for (Map<String, Object> map : crawlStat.getPages()) {
134
+ for (Column outputColumn : outputSchema.getColumns()) {
135
+ final Object value = map.get(outputColumn.getName());
136
+ setValue(value, outputColumn);
137
+ }
138
+ builder.addRecord();
139
+ }
140
+ }
141
+ builder.finish();
142
+ }
143
+
144
+ @Override
145
+ public void close()
146
+ {
147
+ builder.close();
148
+ }
149
+
150
+ @Override
151
+ public void add(Page page)
152
+ {
153
+ reader.setPage(page);
154
+ while (reader.nextRecord()) {
155
+ controller.addSeed(reader.getString(keyNameColumn));
156
+ }
157
+ Map<String, Object> customData = Maps.newHashMap();
158
+ customData.put("output_prefix", task.getOutputPrefix());
159
+ if (task.getShouldNotVisitPattern().isPresent()) {
160
+ customData.put("should_not_visit_pattern", task.getShouldNotVisitPattern().get());
161
+ }
162
+ controller.setCustomData(customData);
163
+ controller.start(EmbulkCrawler.class, task.getNumberOfCrawlers());
164
+ }
165
+
166
+ /**
167
+ * @param seeds
168
+ */
169
+ private void setValue(Object value, Column column)
170
+ {
171
+ if (value == null) {
172
+ builder.setNull(column);
173
+ }
174
+ else if (column.getType().equals(Types.STRING)) {
175
+ builder.setString(column, (String) value);
176
+ }
177
+ else if (column.getType().equals(Types.LONG)) {
178
+ builder.setLong(column, (Integer) value);
179
+ }
180
+ }
181
+
182
+ /**
183
+ * @param seeds
184
+ * @return
185
+ */
186
+ private CrawlController getController()
187
+ {
188
+ CrawlConfig config = new CrawlConfig();
189
+ String directoryPath = String.format(task.getCrawlStorageFolder(), UUID.randomUUID());
190
+ File dir = new File(directoryPath);
191
+ dir.mkdirs();
192
+
193
+ config.setCrawlStorageFolder(directoryPath);
194
+ if (task.getMaxDepthOfCrawling().isPresent()) {
195
+ config.setMaxDepthOfCrawling(task.getMaxDepthOfCrawling().get());
196
+ }
197
+
198
+ config.setMaxPagesToFetch(task.getMaxPagesToFetch());
199
+ if (task.getPolitenessDelay().isPresent()) {
200
+ config.setPolitenessDelay(task.getPolitenessDelay().get());
201
+ }
202
+ if (task.getUserAgentString().isPresent()) {
203
+ config.setUserAgentString(task.getUserAgentString().get());
204
+ }
205
+
206
+ PageFetcher pageFetcher = new PageFetcher(config);
207
+ RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
208
+ RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
209
+ CrawlController crawlController = null;
210
+ try {
211
+ crawlController = new CrawlController(config, pageFetcher, robotstxtServer);
212
+ }
213
+ catch (Exception e) {
214
+ e.printStackTrace();
215
+ }
216
+
217
+ return crawlController;
218
+ }
219
+ };
220
+ }
221
+ }
@@ -0,0 +1,103 @@
1
+ package org.embulk.filter.crawler;
2
+
3
+ import java.util.Map;
4
+ import java.util.regex.Pattern;
5
+
6
+ import com.google.common.collect.Maps;
7
+
8
+ import edu.uci.ics.crawler4j.crawler.Page;
9
+ import edu.uci.ics.crawler4j.crawler.WebCrawler;
10
+ import edu.uci.ics.crawler4j.parser.BinaryParseData;
11
+ import edu.uci.ics.crawler4j.parser.HtmlParseData;
12
+ import edu.uci.ics.crawler4j.parser.ParseData;
13
+ import edu.uci.ics.crawler4j.parser.TextParseData;
14
+ import edu.uci.ics.crawler4j.url.WebURL;
15
+
16
+ public class EmbulkCrawler extends WebCrawler
17
+ {
18
+
19
+ CrawlStat myCrawlStat;
20
+ Pattern IMAGE_EXTENSIONS = null;
21
+
22
+ public EmbulkCrawler()
23
+ {
24
+ myCrawlStat = new CrawlStat();
25
+ }
26
+
27
+ private Map<String, Object> params;
28
+
29
+ @SuppressWarnings("unchecked")
30
+ @Override
31
+ public void onStart()
32
+ {
33
+ params = (Map<String, Object>) myController.getCustomData();
34
+ Object regex = params.get("should_not_visit_pattern");
35
+ if (regex != null) {
36
+ IMAGE_EXTENSIONS = Pattern.compile((String)regex);
37
+ }
38
+ }
39
+
40
+ /**
41
+ * You should implement this function to specify whether the given url
42
+ * should be crawled or not (based on your crawling logic).
43
+ */
44
+ @Override
45
+ public boolean shouldVisit(Page referringPage, WebURL url)
46
+ {
47
+ String href = url.getURL().toLowerCase();
48
+ if (IMAGE_EXTENSIONS != null) {
49
+ if (IMAGE_EXTENSIONS.matcher(href).matches()) {
50
+ return false;
51
+ }
52
+ }
53
+
54
+ return url.getDomain().equals(referringPage.getWebURL().getDomain());
55
+ }
56
+
57
+ /**
58
+ * This function is called when a page is fetched and ready to be processed
59
+ * by your program.
60
+ */
61
+ @Override
62
+ public void visit(Page page)
63
+ {
64
+ final WebURL webURL = page.getWebURL();
65
+ final String outputPrefix = (String) params.get("output_prefix");
66
+
67
+ Map<String, Object> map = Maps.newHashMap();
68
+ map.put(outputPrefix + Constants.URL, webURL.getURL());
69
+ map.put(outputPrefix + Constants.DOMAIN, webURL.getDomain());
70
+ map.put(outputPrefix + Constants.SUBDOMAIN, webURL.getSubDomain());
71
+ map.put(outputPrefix + Constants.PATH, webURL.getPath());
72
+ map.put(outputPrefix + Constants.ANCHOR, webURL.getAnchor());
73
+ map.put(outputPrefix + Constants.PARENT_URL, webURL.getParentUrl());
74
+ map.put(outputPrefix + Constants.CONTENT_CHARSET, page.getContentCharset());
75
+ map.put(outputPrefix + Constants.REDIRECT_TO_URL, page.getRedirectedToUrl());
76
+ map.put(outputPrefix + Constants.LANGUAGE, page.getLanguage());
77
+ map.put(outputPrefix + Constants.STATUS_CODE, page.getStatusCode());
78
+
79
+ ParseData parseData = page.getParseData();
80
+ if (parseData instanceof HtmlParseData) {
81
+ HtmlParseData htmlParseData = (HtmlParseData) parseData;
82
+ map.put(outputPrefix + Constants.TITLE, htmlParseData.getTitle());
83
+ map.put(outputPrefix + Constants.TEXT, htmlParseData.getText());
84
+ map.put(outputPrefix + Constants.HTML, htmlParseData.getHtml());
85
+ }
86
+ else if (parseData instanceof TextParseData) {
87
+ TextParseData textParseData = (TextParseData) parseData;
88
+ map.put(outputPrefix + Constants.TEXT, textParseData.getTextContent());
89
+ }
90
+ else if (parseData instanceof BinaryParseData) {
91
+ BinaryParseData binaryParseData = (BinaryParseData) parseData;
92
+ map.put(outputPrefix + Constants.HTML, binaryParseData.getHtml());
93
+ }
94
+ logger.info("{}", webURL.getURL());
95
+ myCrawlStat.pages.add(map);
96
+ }
97
+
98
+ @Override
99
+ public Object getMyLocalData()
100
+ {
101
+ return myCrawlStat;
102
+ }
103
+ }
@@ -0,0 +1,5 @@
1
+ package org.embulk.filter.crawler;
2
+
3
/**
 * Placeholder test class for the crawler filter plugin; it declares no test
 * cases yet.
 */
public class TestCrawlerFilterPlugin
{
    // Intentionally empty: no tests have been written for the plugin.
}
metadata ADDED
@@ -0,0 +1,131 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-filter-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - toyama0919
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ~>
17
+ - !ruby/object:Gem::Version
18
+ version: '1.0'
19
+ name: bundler
20
+ prerelease: false
21
+ type: :development
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '10.0'
33
+ name: rake
34
+ prerelease: false
35
+ type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: Crawler4J filter plugin for Embulk
42
+ email:
43
+ - toyama0919@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - LICENSE.txt
50
+ - README.md
51
+ - build.gradle
52
+ - config/checkstyle/checkstyle.xml
53
+ - config/checkstyle/default.xml
54
+ - gradle/wrapper/gradle-wrapper.jar
55
+ - gradle/wrapper/gradle-wrapper.properties
56
+ - gradlew
57
+ - gradlew.bat
58
+ - lib/embulk/filter/crawler.rb
59
+ - src/main/java/org/embulk/filter/crawler/Constants.java
60
+ - src/main/java/org/embulk/filter/crawler/CrawlStat.java
61
+ - src/main/java/org/embulk/filter/crawler/CrawlerFilterPlugin.java
62
+ - src/main/java/org/embulk/filter/crawler/EmbulkCrawler.java
63
+ - src/test/java/org/embulk/filter/crawler/TestCrawlerFilterPlugin.java
64
+ - classpath/apache-mime4j-core-0.7.2.jar
65
+ - classpath/apache-mime4j-dom-0.7.2.jar
66
+ - classpath/asm-debug-all-4.1.jar
67
+ - classpath/aspectjrt-1.6.11.jar
68
+ - classpath/bcmail-jdk15-1.45.jar
69
+ - classpath/bcprov-jdk15-1.45.jar
70
+ - classpath/boilerpipe-1.1.0.jar
71
+ - classpath/commons-codec-1.9.jar
72
+ - classpath/commons-compress-1.5.jar
73
+ - classpath/commons-logging-1.2.jar
74
+ - classpath/crawler4j-4.2.jar
75
+ - classpath/dom4j-1.6.1.jar
76
+ - classpath/embulk-filter-crawler-0.1.0.jar
77
+ - classpath/fontbox-1.8.4.jar
78
+ - classpath/geronimo-stax-api_1.0_spec-1.0.1.jar
79
+ - classpath/httpclient-4.4.jar
80
+ - classpath/httpcore-4.4.jar
81
+ - classpath/isoparser-1.0-RC-1.jar
82
+ - classpath/jdom-1.0.jar
83
+ - classpath/je-5.0.73.jar
84
+ - classpath/jempbox-1.8.4.jar
85
+ - classpath/jhighlight-1.0.jar
86
+ - classpath/juniversalchardet-1.0.3.jar
87
+ - classpath/lidalia-slf4j-ext-1.0.0.jar
88
+ - classpath/metadata-extractor-2.6.2.jar
89
+ - classpath/netcdf-4.2-min.jar
90
+ - classpath/pdfbox-1.8.4.jar
91
+ - classpath/poi-3.10-beta2.jar
92
+ - classpath/poi-ooxml-3.10-beta2.jar
93
+ - classpath/poi-ooxml-schemas-3.10-beta2.jar
94
+ - classpath/poi-scratchpad-3.10-beta2.jar
95
+ - classpath/rome-0.9.jar
96
+ - classpath/tagsoup-1.2.1.jar
97
+ - classpath/tika-core-1.5.jar
98
+ - classpath/tika-parsers-1.5.jar
99
+ - classpath/vorbis-java-core-0.1-tests.jar
100
+ - classpath/vorbis-java-core-0.1.jar
101
+ - classpath/vorbis-java-tika-0.1.jar
102
+ - classpath/xercesImpl-2.8.1.jar
103
+ - classpath/xml-apis-1.3.03.jar
104
+ - classpath/xmlbeans-2.3.0.jar
105
+ - classpath/xmpcore-5.1.2.jar
106
+ - classpath/xz-1.2.jar
107
+ homepage: https://github.com/toyama0919/embulk-filter-crawler
108
+ licenses:
109
+ - MIT
110
+ metadata: {}
111
+ post_install_message:
112
+ rdoc_options: []
113
+ require_paths:
114
+ - lib
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - '>='
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ requirements: []
126
+ rubyforge_project:
127
+ rubygems_version: 2.1.9
128
+ signing_key:
129
+ specification_version: 4
130
+ summary: Crawler4J filter plugin for Embulk
131
+ test_files: []